Merge pull request #81 from DeBrosOfficial/cleanup/dead-code

Cleanup/dead code
This commit is contained in:
anonpenguin 2026-02-11 09:57:34 +02:00 committed by GitHub
commit 051c002ec8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
82 changed files with 7222 additions and 3324 deletions

View File

@ -1,6 +0,0 @@
# THIS IS AUTOGENERATED. DO NOT EDIT MANUALLY
version = 1
name = "network"
[setup]
# SECURITY(review): this bearer token was committed in plaintext. Deleting the
# file does NOT remove the secret from git history — rotate/revoke this token
# and inject it via the environment (or an untracked .env) instead.
script = "export MCP_BEARER_TOKEN=\"ra_9941ab97eb51668394a68963a2ab6fead0ca942afe437a6e2f4a520efcb24036\""

5
.gitignore vendored
View File

@ -101,3 +101,8 @@ vps.txt
bin-linux/
website/
terms-agreement
cli
# anchored to repo root; a leading "./" prefix never matches in .gitignore patterns
/inspector

View File

@ -84,9 +84,9 @@ test-e2e-quick:
# Network - Distributed P2P Database System
# Makefile for development and build tasks
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill redeploy-devnet redeploy-testnet release health
VERSION := 0.101.6
VERSION := 0.102.0
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
@ -196,6 +196,42 @@ stop:
kill:
@bash scripts/dev-kill-all.sh
# Deploy to devnet (build + rolling upgrade all nodes)
redeploy-devnet:
@bash scripts/redeploy.sh --devnet
# Deploy to devnet without rebuilding
redeploy-devnet-quick:
@bash scripts/redeploy.sh --devnet --no-build
# Deploy to testnet (build + rolling upgrade all nodes)
redeploy-testnet:
@bash scripts/redeploy.sh --testnet
# Deploy to testnet without rebuilding
redeploy-testnet-quick:
@bash scripts/redeploy.sh --testnet --no-build
# Interactive release workflow (tag + push)
release:
@bash scripts/release.sh
# Check health of all nodes in an environment
# Usage: make health ENV=devnet
# Reads pipe-delimited rows (env|user@host|password|role|ssh_key) from
# scripts/remote-nodes.conf and runs check-node-health.sh against every row
# whose environment matches $(ENV).
# NOTE(review): only the env and role fields are whitespace-trimmed (via xargs);
# host and password are passed through verbatim, and the "#" comment check runs
# before trimming, so an indented comment line would NOT be skipped — confirm
# the conf file uses no leading whitespace.
health:
@if [ -z "$(ENV)" ]; then \
echo "Usage: make health ENV=devnet|testnet"; \
exit 1; \
fi
@while IFS='|' read -r env host pass role key; do \
[ -z "$$env" ] && continue; \
case "$$env" in \#*) continue;; esac; \
env="$$(echo "$$env" | xargs)"; \
[ "$$env" != "$(ENV)" ] && continue; \
role="$$(echo "$$role" | xargs)"; \
bash scripts/check-node-health.sh "$$host" "$$pass" "$$host ($$role)"; \
done < scripts/remote-nodes.conf
# Help
help:
@echo "Available targets:"
@ -225,6 +261,14 @@ help:
@echo " Example production test:"
@echo " ORAMA_GATEWAY_URL=https://dbrs.space make test-e2e-prod"
@echo ""
@echo "Deployment:"
@echo " make redeploy-devnet - Build + rolling deploy to all devnet nodes"
@echo " make redeploy-devnet-quick - Deploy to devnet without rebuilding"
@echo " make redeploy-testnet - Build + rolling deploy to all testnet nodes"
@echo " make redeploy-testnet-quick - Deploy to testnet without rebuilding"
@echo " make health ENV=devnet - Check health of all nodes in an environment"
@echo " make release - Interactive release workflow (tag + push)"
@echo ""
@echo "Development Management (via orama):"
@echo " ./bin/orama dev status - Show status of all dev services"
@echo " ./bin/orama dev logs <component> [--follow]"

View File

@ -88,6 +88,10 @@ func main() {
case "db":
cli.HandleDBCommand(args)
// Cluster inspection
case "inspect":
cli.HandleInspectCommand(args)
// Namespace management
case "namespace":
cli.HandleNamespaceCommand(args)
@ -173,6 +177,12 @@ func showHelp() {
fmt.Printf("🏢 Namespaces:\n")
fmt.Printf(" namespace delete - Delete current namespace and all resources\n\n")
fmt.Printf("🔍 Cluster Inspection:\n")
fmt.Printf(" inspect - Inspect cluster health via SSH\n")
fmt.Printf(" inspect --env devnet - Inspect devnet nodes\n")
fmt.Printf(" inspect --subsystem rqlite - Inspect only RQLite subsystem\n")
fmt.Printf(" inspect --format json - Output as JSON\n\n")
fmt.Printf("🌍 Environments:\n")
fmt.Printf(" env list - List all environments\n")
fmt.Printf(" env current - Show current environment\n")

View File

@ -14,10 +14,6 @@ import (
"go.uber.org/zap"
)
// For transition, alias main.GatewayConfig to pkg/gateway.Config
// server.go will be removed; this keeps compatibility until then.
type GatewayConfig = gateway.Config
func getEnvDefault(key, def string) string {
if v := os.Getenv(key); strings.TrimSpace(v) != "" {
return v

11
cmd/inspector/main.go Normal file
View File

@ -0,0 +1,11 @@
package main
import (
"os"
"github.com/DeBrosOfficial/network/pkg/cli"
)
func main() {
cli.HandleInspectCommand(os.Args[1:])
}

160
docs/COMMON_PROBLEMS.md Normal file
View File

@ -0,0 +1,160 @@
# Common Problems & Solutions
Troubleshooting guide for known issues in the Orama Network.
---
## 1. Namespace Gateway: "Olric unavailable"
**Symptom:** `ns-<name>.orama-devnet.network/v1/health` returns `"olric": {"status": "unavailable"}`.
**Cause:** The Olric memberlist gossip between namespace nodes is broken. Olric uses UDP pings for health checks — if those fail, the cluster can't bootstrap and the gateway reports Olric as unavailable.
### Check 1: WireGuard packet loss between nodes
SSH into each node and ping the other namespace nodes over WireGuard:
```bash
ping -c 10 -W 2 10.0.0.X # replace with the WG IP of each peer
```
If you see packet loss over WireGuard but **not** over the public IP (`ping <public-ip>`), the WireGuard peer session is corrupted.
**Fix — Reset the WireGuard peer on both sides:**
```bash
# On Node A — replace <pubkey> and <endpoint> with Node B's values
wg set wg0 peer <NodeB-pubkey> remove
wg set wg0 peer <NodeB-pubkey> endpoint <NodeB-public-ip>:51820 allowed-ips <NodeB-wg-ip>/32 persistent-keepalive 25
# On Node B — same but with Node A's values
wg set wg0 peer <NodeA-pubkey> remove
wg set wg0 peer <NodeA-pubkey> endpoint <NodeA-public-ip>:51820 allowed-ips <NodeA-wg-ip>/32 persistent-keepalive 25
```
Then restart services: `sudo orama prod restart`
You can find peer public keys with `wg show wg0`.
### Check 2: Olric bound to 0.0.0.0 instead of WireGuard IP
Check the Olric config on each node:
```bash
cat /home/debros/.orama/data/namespaces/<name>/configs/olric-*.yaml
```
If `bindAddr` is `0.0.0.0`, the node will try to bind to IPv6 on dual-stack hosts, breaking memberlist gossip.
**Fix:** Edit the YAML to use the node's WireGuard IP (run `ip addr show wg0` to find it), then restart: `sudo orama prod restart`
This was fixed in code (BindAddr validation in `SpawnOlric`), so new namespaces won't have this issue.
### Check 3: Olric logs show "Failed UDP ping" constantly
```bash
journalctl -u debros-namespace-olric@<name>.service --no-pager -n 30
```
If every UDP ping fails but TCP stream connections succeed, it's the WireGuard packet loss issue (see Check 1).
---
## 2. Namespace Gateway: Missing config fields
**Symptom:** Gateway config YAML is missing `global_rqlite_dsn`, has `olric_timeout: 0s`, or `olric_servers` only lists `localhost`.
**Cause:** Before the spawn handler fix, `spawnGatewayRemote()` didn't send `global_rqlite_dsn` or `olric_timeout` to remote nodes.
**Fix:** Edit the gateway config manually:
```bash
vim /home/debros/.orama/data/namespaces/<name>/configs/gateway-*.yaml
```
Add/fix:
```yaml
global_rqlite_dsn: "http://10.0.0.X:10001"
olric_timeout: 30s
olric_servers:
- "10.0.0.X:10002"
- "10.0.0.Y:10002"
- "10.0.0.Z:10002"
```
Then: `sudo orama prod restart`
This was fixed in code, so new namespaces get the correct config.
---
## 3. Namespace not restoring after restart (missing cluster-state.json)
**Symptom:** After `orama prod restart`, the namespace services don't come back because `RestoreLocalClustersFromDisk` has no state file.
**Check:**
```bash
ls /home/debros/.orama/data/namespaces/<name>/cluster-state.json
```
If the file doesn't exist, the node can't restore the namespace.
**Fix:** Create the file manually from another node that has it, or reconstruct it. The format is:
```json
{
"namespace": "<name>",
"rqlite": { "http_port": 10001, "raft_port": 10000, ... },
"olric": { "http_port": 10002, "memberlist_port": 10003, ... },
"gateway": { "http_port": 10004, ... }
}
```
This was fixed in code — `ProvisionCluster` now saves state to all nodes (including remote ones via the `save-cluster-state` spawn action).
---
## 4. Namespace gateway processes not restarting after upgrade
**Symptom:** After `orama upgrade --restart` or `orama prod restart`, namespace gateway/olric/rqlite services don't start.
**Cause:** `orama prod stop` disables systemd template services (`debros-namespace-gateway@<name>.service`). They have `PartOf=debros-node.service`, but that only propagates restart to **enabled** services.
**Fix:** Re-enable the services before restarting:
```bash
systemctl enable debros-namespace-rqlite@<name>.service
systemctl enable debros-namespace-olric@<name>.service
systemctl enable debros-namespace-gateway@<name>.service
sudo orama prod restart
```
This was fixed in code — the upgrade orchestrator now re-enables `@` services before restarting.
---
## 5. SSH commands eating stdin inside heredocs
**Symptom:** When running a script that SSHes into multiple nodes inside a heredoc (`<<'EOS'`), only the first SSH command runs — the rest are silently skipped.
**Cause:** `ssh` reads from stdin, consuming the rest of the heredoc.
**Fix:** Add `-n` flag to all `ssh` calls inside heredocs:
```bash
ssh -n user@host 'command'
```
`scp` is not affected (doesn't read stdin).
---
## General Debugging Tips
- **Always use `sudo orama prod restart`** instead of raw `systemctl` commands
- **Namespace data lives at:** `/home/debros/.orama/data/namespaces/<name>/`
- **Check service logs:** `journalctl -u debros-namespace-olric@<name>.service --no-pager -n 50`
- **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes
- **Check gateway health:** `curl http://localhost:<port>/v1/health` from the node itself
- **Node IPs:** Check `scripts/remote-nodes.conf` for credentials, `wg show wg0` for WG IPs

213
docs/INSPECTOR.md Normal file
View File

@ -0,0 +1,213 @@
# Inspector
The inspector is a cluster health check tool that SSHs into every node, collects subsystem data in parallel, runs deterministic checks, and optionally sends failures to an AI model for root-cause analysis.
## Pipeline
```
Collect (parallel SSH) → Check (deterministic Go) → Report (table/JSON) → Analyze (optional AI)
```
1. **Collect** — SSH into every node in parallel, run diagnostic commands, parse results into structured data.
2. **Check** — Run pure Go check functions against the collected data. Each check produces a pass/fail/warn/skip result with a severity level.
3. **Report** — Print results as a table (default) or JSON. Failures sort first, grouped by subsystem.
4. **Analyze** — If `--ai` is enabled and there are failures or warnings, send them to an LLM via OpenRouter for root-cause analysis.
## Quick Start
```bash
# Inspect all subsystems on devnet
orama inspect --env devnet
# Inspect only RQLite
orama inspect --env devnet --subsystem rqlite
# JSON output
orama inspect --env devnet --format json
# With AI analysis
orama inspect --env devnet --ai
```
## Usage
```
orama inspect [flags]
```
| Flag | Default | Description |
|------|---------|-------------|
| `--config` | `scripts/remote-nodes.conf` | Path to node configuration file |
| `--env` | *(required)* | Environment to inspect (`devnet`, `testnet`) |
| `--subsystem` | `all` | Comma-separated subsystems to inspect |
| `--format` | `table` | Output format: `table` or `json` |
| `--timeout` | `30s` | SSH command timeout per node |
| `--verbose` | `false` | Print collection progress |
| `--ai` | `false` | Enable AI analysis of failures |
| `--model` | `moonshotai/kimi-k2.5` | OpenRouter model for AI analysis |
| `--api-key` | `$OPENROUTER_API_KEY` | OpenRouter API key |
### Subsystem Names
`rqlite`, `olric`, `ipfs`, `dns`, `wireguard` (alias: `wg`), `system`, `network`, `namespace`
Multiple subsystems can be combined: `--subsystem rqlite,olric,dns`
## Subsystems
| Subsystem | What It Checks |
|-----------|---------------|
| **rqlite** | Raft state, leader election, readyz, commit/applied gap, FSM pending, strong reads, debug vars (query errors, leader_not_found, snapshots), cross-node leader agreement, term consistency, applied index convergence, quorum, version match |
| **olric** | Service active, memberlist up, restart count, memory usage, log analysis (suspects, flapping, errors), cross-node memberlist consistency |
| **ipfs** | Daemon active, cluster active, swarm peer count, cluster peer count, cluster errors, repo usage %, swarm key present, bootstrap list empty, cross-node version consistency |
| **dns** | CoreDNS active, Caddy active, ports (53/80/443), memory, restart count, log errors, Corefile exists, SOA/NS/wildcard/base-A resolution, TLS cert expiry, cross-node nameserver availability |
| **wireguard** | Interface up, service active, correct 10.0.0.x IP, listen port 51820, peer count vs expected, MTU 1420, config exists + permissions 600, peer handshakes (fresh/stale/never), peer traffic, catch-all route detection, cross-node peer count + MTU consistency |
| **system** | Core services (debros-node, rqlite, olric, ipfs, ipfs-cluster, wg-quick), nameserver services (coredns, caddy), failed systemd units, memory/disk/inode usage, load average, OOM kills, swap, UFW active, process user (debros), panic count, expected ports |
| **network** | Internet reachability, default route, WireGuard route, TCP connection count, TIME_WAIT count, TCP retransmission rate, WireGuard mesh ping (all peers) |
| **namespace** | Per-namespace: RQLite up + raft state + readyz, Olric memberlist, Gateway HTTP health. Cross-namespace: all-healthy check, RQLite quorum per namespace |
## Severity Levels
| Level | When Used |
|-------|-----------|
| **CRITICAL** | Service completely down. Raft quorum lost, RQLite unresponsive, no leader. |
| **HIGH** | Service degraded. Olric down, gateway not responding, IPFS swarm key missing. |
| **MEDIUM** | Non-ideal but functional. Stale handshakes, elevated memory, log suspects. |
| **LOW** | Informational. Non-standard MTU, port mismatch, version skew. |
## Check Statuses
| Status | Meaning |
|--------|---------|
| **pass** | Check passed. |
| **fail** | Check failed — action needed. |
| **warn** | Degraded — monitor or investigate. |
| **skip** | Check could not run (insufficient data). |
## Output Formats
### Table (default)
```
Inspecting 14 devnet nodes...
## RQLITE
----------------------------------------------------------------------
OK [CRITICAL] RQLite responding (ubuntu@10.0.0.1)
responsive=true version=v8.36.16
FAIL [CRITICAL] Cluster has exactly one leader
leaders=0 (NO LEADER)
...
======================================================================
Summary: 800 passed, 12 failed, 31 warnings, 0 skipped (4.2s)
```
Failures sort first, then warnings, then passes. Within each group, higher severity checks appear first.
### JSON (`--format json`)
```json
{
"summary": {
"passed": 800,
"failed": 12,
"warned": 31,
"skipped": 0,
"total": 843,
"duration_seconds": 4.2
},
"checks": [
{
"id": "rqlite.responsive",
"name": "RQLite responding",
"subsystem": "rqlite",
"severity": 3,
"status": "pass",
"message": "responsive=true version=v8.36.16",
"node": "ubuntu@10.0.0.1"
}
]
}
```
## AI Analysis
When `--ai` is enabled, failures and warnings are sent to an LLM via OpenRouter for root-cause analysis.
```bash
# Use default model (kimi-k2.5)
orama inspect --env devnet --ai
# Use a different model
orama inspect --env devnet --ai --model openai/gpt-4o
# Pass API key directly
orama inspect --env devnet --ai --api-key sk-or-...
```
The API key can be set via:
1. `--api-key` flag
2. `OPENROUTER_API_KEY` environment variable
3. `.env` file in the current directory
The AI receives the full check results plus cluster metadata and returns a structured analysis with likely root causes and suggested fixes.
## Exit Codes
| Code | Meaning |
|------|---------|
| `0` | All checks passed (or only warnings). |
| `1` | At least one check failed. |
## Configuration
The inspector reads node definitions from a pipe-delimited config file (default: `scripts/remote-nodes.conf`).
### Format
```
# environment|user@host|password|role|ssh_key
devnet|ubuntu@1.2.3.4|mypassword|node|
devnet|ubuntu@5.6.7.8|mypassword|nameserver-ns1|/path/to/key
```
| Field | Description |
|-------|-------------|
| `environment` | Cluster name (`devnet`, `testnet`) |
| `user@host` | SSH credentials |
| `password` | SSH password |
| `role` | `node` or `nameserver-ns1`, `nameserver-ns2`, etc. |
| `ssh_key` | Optional path to SSH private key |
Blank lines and lines starting with `#` are ignored. Because this file contains SSH passwords and key paths, keep it out of version control.
### Node Roles
- **`node`** — Regular cluster node. Runs RQLite, Olric, IPFS, WireGuard, namespaces.
- **`nameserver-*`** — DNS nameserver. Runs CoreDNS + Caddy in addition to base services. System checks verify nameserver-specific services.
## Examples
```bash
# Full cluster inspection
orama inspect --env devnet
# Check only networking
orama inspect --env devnet --subsystem wireguard,network
# Quick RQLite health check
orama inspect --env devnet --subsystem rqlite
# Verbose mode (shows collection progress)
orama inspect --env devnet --verbose
# JSON for scripting / piping
orama inspect --env devnet --format json | jq '.checks[] | select(.status == "fail")'
# AI-assisted debugging
orama inspect --env devnet --ai --model anthropic/claude-sonnet-4
# Custom config file
orama inspect --config /path/to/nodes.conf --env testnet
```

View File

@ -1,415 +0,0 @@
//go:build e2e
package cluster_test
import (
"bytes"
"context"
"fmt"
"io"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/DeBrosOfficial/network/pkg/ipfs"
)
// Note: These tests connect directly to IPFS Cluster API (localhost:9094)
// and IPFS API (localhost:4501). They are for local development only.
// For production testing, use storage_http_test.go which uses gateway endpoints.
func TestIPFSCluster_Health(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 10 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
err = client.Health(ctx)
if err != nil {
t.Fatalf("health check failed: %v", err)
}
}
func TestIPFSCluster_GetPeerCount(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 10 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
peerCount, err := client.GetPeerCount(ctx)
if err != nil {
t.Fatalf("get peer count failed: %v", err)
}
if peerCount < 0 {
t.Fatalf("expected non-negative peer count, got %d", peerCount)
}
t.Logf("IPFS cluster peers: %d", peerCount)
}
func TestIPFSCluster_AddFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
content := []byte("IPFS cluster test content")
result, err := client.Add(ctx, bytes.NewReader(content), "test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
if result.Cid == "" {
t.Fatalf("expected non-empty CID")
}
if result.Size != int64(len(content)) {
t.Fatalf("expected size %d, got %d", len(content), result.Size)
}
t.Logf("Added file with CID: %s", result.Cid)
}
func TestIPFSCluster_PinFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file first
content := []byte("IPFS pin test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "pin-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Pin the file
pinResult, err := client.Pin(ctx, cid, "pinned-file", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
t.Logf("Pinned file: %s", cid)
}
func TestIPFSCluster_PinStatus(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add and pin file
content := []byte("IPFS status test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "status-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
pinResult, err := client.Pin(ctx, cid, "status-test", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
// Give pin time to propagate
e2e.Delay(1000)
// Get status
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Fatalf("get pin status failed: %v", err)
}
if status.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
}
if status.Name != "status-test" {
t.Fatalf("expected name 'status-test', got %s", status.Name)
}
if status.ReplicationFactor < 1 {
t.Logf("warning: replication factor is %d, expected >= 1", status.ReplicationFactor)
}
t.Logf("Pin status: %s (replication: %d, peers: %d)", status.Status, status.ReplicationFactor, len(status.Peers))
}
func TestIPFSCluster_UnpinFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add and pin file
content := []byte("IPFS unpin test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "unpin-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
_, err = client.Pin(ctx, cid, "unpin-test", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
// Unpin file
err = client.Unpin(ctx, cid)
if err != nil {
t.Fatalf("unpin file failed: %v", err)
}
t.Logf("Unpinned file: %s", cid)
}
func TestIPFSCluster_GetFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file
content := []byte("IPFS get test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "get-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Give time for propagation
e2e.Delay(1000)
// Get file
rc, err := client.Get(ctx, cid, e2e.GetIPFSAPIURL())
if err != nil {
t.Fatalf("get file failed: %v", err)
}
defer rc.Close()
retrievedContent, err := io.ReadAll(rc)
if err != nil {
t.Fatalf("failed to read content: %v", err)
}
if !bytes.Equal(retrievedContent, content) {
t.Fatalf("content mismatch: expected %q, got %q", string(content), string(retrievedContent))
}
t.Logf("Retrieved file: %s (%d bytes)", cid, len(retrievedContent))
}
func TestIPFSCluster_LargeFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 60 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Create 5MB file
content := bytes.Repeat([]byte("x"), 5*1024*1024)
result, err := client.Add(ctx, bytes.NewReader(content), "large.bin")
if err != nil {
t.Fatalf("add large file failed: %v", err)
}
if result.Cid == "" {
t.Fatalf("expected non-empty CID")
}
if result.Size != int64(len(content)) {
t.Fatalf("expected size %d, got %d", len(content), result.Size)
}
t.Logf("Added large file with CID: %s (%d bytes)", result.Cid, result.Size)
}
func TestIPFSCluster_ReplicationFactor(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file
content := []byte("IPFS replication test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "replication-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Pin with specific replication factor
replicationFactor := 2
pinResult, err := client.Pin(ctx, cid, "replication-test", replicationFactor)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
// Give time for replication
e2e.Delay(2000)
// Check status
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Fatalf("get pin status failed: %v", err)
}
t.Logf("Replication factor: requested=%d, actual=%d, peers=%d", replicationFactor, status.ReplicationFactor, len(status.Peers))
}
func TestIPFSCluster_MultipleFiles(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add multiple files
numFiles := 5
var cids []string
for i := 0; i < numFiles; i++ {
content := []byte(fmt.Sprintf("File %d", i))
result, err := client.Add(ctx, bytes.NewReader(content), fmt.Sprintf("file%d.txt", i))
if err != nil {
t.Fatalf("add file %d failed: %v", i, err)
}
cids = append(cids, result.Cid)
}
if len(cids) != numFiles {
t.Fatalf("expected %d files added, got %d", numFiles, len(cids))
}
// Verify all files exist
for i, cid := range cids {
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Logf("warning: failed to get status for file %d: %v", i, err)
continue
}
if status.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
}
}
t.Logf("Successfully added and verified %d files", numFiles)
}

View File

@ -1,296 +0,0 @@
//go:build e2e
package cluster_test
import (
"context"
"net/http"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
)
func TestLibP2P_PeerConnectivity(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create and connect client
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Verify peer connectivity through the gateway
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
if len(peers) == 0 {
t.Logf("warning: no peers connected (cluster may still be initializing)")
}
}
func TestLibP2P_BootstrapPeers(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
bootstrapPeers := e2e.GetBootstrapPeers()
if len(bootstrapPeers) == 0 {
t.Skipf("E2E_BOOTSTRAP_PEERS not set; skipping")
}
// Create client with bootstrap peers explicitly set
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Give peer discovery time
e2e.Delay(2000)
// Verify we're connected (check via gateway status)
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("status request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
if resp["connected"] != true {
t.Logf("warning: client not connected to network (cluster may still be initializing)")
}
}
func TestLibP2P_MultipleClientConnections(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create multiple clients
c1 := e2e.NewNetworkClient(t)
c2 := e2e.NewNetworkClient(t)
c3 := e2e.NewNetworkClient(t)
if err := c1.Connect(); err != nil {
t.Fatalf("c1 connect failed: %v", err)
}
defer c1.Disconnect()
if err := c2.Connect(); err != nil {
t.Fatalf("c2 connect failed: %v", err)
}
defer c2.Disconnect()
if err := c3.Connect(); err != nil {
t.Fatalf("c3 connect failed: %v", err)
}
defer c3.Disconnect()
// Give peer discovery time
e2e.Delay(2000)
// Verify gateway sees multiple peers
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
if len(peers) < 1 {
t.Logf("warning: expected at least 1 peer, got %d", len(peers))
}
}
func TestLibP2P_ReconnectAfterDisconnect(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
c := e2e.NewNetworkClient(t)
// Connect
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
// Verify connected via gateway
req1 := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
_, status1, err := req1.Do(ctx)
if err != nil || status1 != http.StatusOK {
t.Logf("warning: gateway check failed before disconnect: status %d, err %v", status1, err)
}
// Disconnect
if err := c.Disconnect(); err != nil {
t.Logf("warning: disconnect failed: %v", err)
}
// Give time for disconnect to propagate
e2e.Delay(500)
// Reconnect
if err := c.Connect(); err != nil {
t.Fatalf("reconnect failed: %v", err)
}
defer c.Disconnect()
// Verify connected via gateway again
req2 := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
_, status2, err := req2.Do(ctx)
if err != nil || status2 != http.StatusOK {
t.Logf("warning: gateway check failed after reconnect: status %d, err %v", status2, err)
}
}
// TestLibP2P_PeerDiscovery verifies that, after a client connects and peer
// discovery has had time to run, the gateway's /v1/network/peers endpoint
// returns a well-formed list of peer multiaddrs.
//
// Fix: the original used unchecked type assertions (resp["peers"].([]interface{})
// and p.(string)), which panic the whole test binary on a malformed response
// instead of failing this test with a useful message.
func TestLibP2P_PeerDiscovery(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Create client
	c := e2e.NewNetworkClient(t)
	if err := c.Connect(); err != nil {
		t.Fatalf("connect failed: %v", err)
	}
	defer c.Disconnect()
	// Give peer discovery time
	e2e.Delay(3000)
	// Get peer list
	req := &e2e.HTTPRequest{
		Method: http.MethodGet,
		URL:    e2e.GetGatewayURL() + "/v1/network/peers",
	}
	body, status, err := req.Do(ctx)
	if err != nil {
		t.Fatalf("peers request failed: %v", err)
	}
	if status != http.StatusOK {
		t.Fatalf("expected status 200, got %d", status)
	}
	var resp map[string]interface{}
	if err := e2e.DecodeJSON(body, &resp); err != nil {
		t.Fatalf("failed to decode response: %v", err)
	}
	// Checked assertion: a malformed response fails the test, never panics it.
	peers, ok := resp["peers"].([]interface{})
	if !ok {
		t.Fatalf("response missing 'peers' array: %v", resp)
	}
	if len(peers) == 0 {
		t.Logf("warning: no peers discovered (cluster may not have multiple nodes)")
	} else {
		// Verify peer format (should be multiaddr strings)
		for _, p := range peers {
			peerStr, ok := p.(string)
			if !ok {
				t.Fatalf("expected string peer entry, got %T", p)
			}
			if !strings.Contains(peerStr, "/p2p/") && !strings.Contains(peerStr, "/ipfs/") {
				t.Logf("warning: unexpected peer format: %s", peerStr)
			}
		}
	}
}
// TestLibP2P_PeerAddressFormat verifies that every peer address reported by
// the gateway's /v1/network/peers endpoint is in multiaddr form (leading "/").
//
// Fix: the original used unchecked type assertions (resp["peers"].([]interface{})
// and p.(string)), which panic on a malformed response instead of failing the
// test with a diagnostic.
func TestLibP2P_PeerAddressFormat(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Create client
	c := e2e.NewNetworkClient(t)
	if err := c.Connect(); err != nil {
		t.Fatalf("connect failed: %v", err)
	}
	defer c.Disconnect()
	// Get peer list
	req := &e2e.HTTPRequest{
		Method: http.MethodGet,
		URL:    e2e.GetGatewayURL() + "/v1/network/peers",
	}
	body, status, err := req.Do(ctx)
	if err != nil {
		t.Fatalf("peers request failed: %v", err)
	}
	if status != http.StatusOK {
		t.Fatalf("expected status 200, got %d", status)
	}
	var resp map[string]interface{}
	if err := e2e.DecodeJSON(body, &resp); err != nil {
		t.Fatalf("failed to decode response: %v", err)
	}
	// Checked assertions: fail, never panic, on unexpected response shapes.
	peers, ok := resp["peers"].([]interface{})
	if !ok {
		t.Fatalf("response missing 'peers' array: %v", resp)
	}
	for _, p := range peers {
		peerStr, ok := p.(string)
		if !ok {
			t.Fatalf("expected string peer entry, got %T", p)
		}
		// Multiaddrs should start with /
		if !strings.HasPrefix(peerStr, "/") {
			t.Fatalf("expected multiaddr format, got %s", peerStr)
		}
	}
}

View File

@ -1,338 +0,0 @@
//go:build e2e
package cluster_test
import (
"encoding/json"
"fmt"
"net"
"net/http"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/require"
)
// =============================================================================
// STRICT OLRIC CACHE DISTRIBUTION TESTS
// These tests verify that Olric cache data is properly distributed across nodes.
// Tests FAIL if distribution doesn't work - no skips, no warnings.
// =============================================================================
// getOlricNodeAddresses returns HTTP addresses of Olric nodes.
// Note: Olric HTTP port is typically on port 3320 for the main cluster.
func getOlricNodeAddresses() []string {
	// Dev mode runs a single Olric instance; in production each node runs
	// its own, so this helper currently yields exactly one address.
	addrs := make([]string, 0, 1)
	addrs = append(addrs, "http://localhost:3320")
	return addrs
}
// TestOlric_BasicDistribution verifies cache operations work across the cluster.
// It exercises put/get round-trips through the gateway's Olric cache API using
// a DMap name unique to this run (timestamp suffix) so parallel runs don't collide.
func TestOlric_BasicDistribution(t *testing.T) {
	// Note: Not using SkipIfMissingGateway() since LoadTestEnv() creates its own API key
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "FAIL: Could not load test environment")
	require.NotEmpty(t, env.APIKey, "FAIL: No API key available")
	// Unique DMap per run; reused by both subtests below.
	dmap := fmt.Sprintf("dist_test_%d", time.Now().UnixNano())
	t.Run("Put_and_get_from_same_gateway", func(t *testing.T) {
		// Key and value are timestamped so reruns never see stale data.
		key := fmt.Sprintf("key_%d", time.Now().UnixNano())
		value := fmt.Sprintf("value_%d", time.Now().UnixNano())
		// Put
		err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
		require.NoError(t, err, "FAIL: Could not put value to cache")
		// Get
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not get value from cache")
		require.Equal(t, value, retrieved, "FAIL: Retrieved value doesn't match")
		t.Logf(" ✓ Put/Get works: %s = %s", key, value)
	})
	t.Run("Multiple_keys_distributed", func(t *testing.T) {
		// Put multiple keys (should be distributed across partitions)
		// Remember what we wrote so every key can be verified afterwards.
		keys := make(map[string]string)
		for i := 0; i < 20; i++ {
			key := fmt.Sprintf("dist_key_%d_%d", i, time.Now().UnixNano())
			value := fmt.Sprintf("dist_value_%d", i)
			keys[key] = value
			err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
			require.NoError(t, err, "FAIL: Could not put key %s", key)
		}
		t.Logf(" Put 20 keys to cache")
		// Verify all keys are retrievable
		for key, expectedValue := range keys {
			retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
			require.NoError(t, err, "FAIL: Could not get key %s", key)
			require.Equal(t, expectedValue, retrieved, "FAIL: Value mismatch for key %s", key)
		}
		t.Logf(" ✓ All 20 keys are retrievable")
	})
}
// TestOlric_ConcurrentAccess verifies cache handles concurrent operations correctly.
//
// Fix: the "Concurrent_reads_and_writes" subtest previously funneled all 20
// results through a single channel and classified each error as a read or a
// write by its receive index (i < 10). Channel receive order across goroutines
// is nondeterministic, so read failures could be counted as write failures and
// vice versa. Readers and writers now report on dedicated channels.
func TestOlric_ConcurrentAccess(t *testing.T) {
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "FAIL: Could not load test environment")
	dmap := fmt.Sprintf("concurrent_test_%d", time.Now().UnixNano())
	t.Run("Concurrent_writes_to_same_key", func(t *testing.T) {
		key := fmt.Sprintf("concurrent_key_%d", time.Now().UnixNano())
		// Launch multiple goroutines writing to the same key
		done := make(chan error, 10)
		for i := 0; i < 10; i++ {
			go func(idx int) {
				value := fmt.Sprintf("concurrent_value_%d", idx)
				err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
				done <- err
			}(i)
		}
		// Wait for all writes
		var errors []error
		for i := 0; i < 10; i++ {
			if err := <-done; err != nil {
				errors = append(errors, err)
			}
		}
		require.Empty(t, errors, "FAIL: %d concurrent writes failed: %v", len(errors), errors)
		// The key should have ONE of the values (last write wins)
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not get key after concurrent writes")
		require.Contains(t, retrieved, "concurrent_value_", "FAIL: Value doesn't match expected pattern")
		t.Logf(" ✓ Concurrent writes succeeded, final value: %s", retrieved)
	})
	t.Run("Concurrent_reads_and_writes", func(t *testing.T) {
		key := fmt.Sprintf("rw_key_%d", time.Now().UnixNano())
		initialValue := "initial_value"
		// Set initial value
		err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, initialValue)
		require.NoError(t, err, "FAIL: Could not set initial value")
		// Dedicated channels so errors are attributed to the correct side
		// regardless of completion order.
		readDone := make(chan error, 10)
		writeDone := make(chan error, 10)
		// 10 readers
		for i := 0; i < 10; i++ {
			go func() {
				_, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
				readDone <- err
			}()
		}
		// 10 writers
		for i := 0; i < 10; i++ {
			go func(idx int) {
				value := fmt.Sprintf("updated_value_%d", idx)
				err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
				writeDone <- err
			}(i)
		}
		// Wait for all operations; exactly 10 results arrive on each channel.
		var readErrors, writeErrors []error
		for i := 0; i < 10; i++ {
			if err := <-readDone; err != nil {
				readErrors = append(readErrors, err)
			}
			if err := <-writeDone; err != nil {
				writeErrors = append(writeErrors, err)
			}
		}
		require.Empty(t, readErrors, "FAIL: %d reads failed", len(readErrors))
		require.Empty(t, writeErrors, "FAIL: %d writes failed", len(writeErrors))
		t.Logf(" ✓ Concurrent read/write operations succeeded")
	})
}
// TestOlric_NamespaceClusterCache verifies cache works in namespace-specific clusters.
// It provisions a fresh namespace, exercises put/get through that namespace's
// API key, and then opportunistically checks for namespace-scoped Olric
// memberlist listeners on local ports.
func TestOlric_NamespaceClusterCache(t *testing.T) {
	// Create a new namespace
	namespace := fmt.Sprintf("cache-test-%d", time.Now().UnixNano())
	env, err := e2e.LoadTestEnvWithNamespace(namespace)
	require.NoError(t, err, "FAIL: Could not create namespace for cache test")
	require.NotEmpty(t, env.APIKey, "FAIL: No API key")
	t.Logf("Created namespace %s", namespace)
	dmap := fmt.Sprintf("ns_cache_%d", time.Now().UnixNano())
	t.Run("Cache_operations_work_in_namespace", func(t *testing.T) {
		key := fmt.Sprintf("ns_key_%d", time.Now().UnixNano())
		value := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
		// Put using namespace API key
		err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
		require.NoError(t, err, "FAIL: Could not put value in namespace cache")
		// Get
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not get value from namespace cache")
		require.Equal(t, value, retrieved, "FAIL: Value mismatch in namespace cache")
		t.Logf(" ✓ Namespace cache operations work: %s = %s", key, value)
	})
	// Check if namespace Olric instances are running (port 10003 offset in port blocks)
	// Scan every 5th port in the 10003-10098 range; a successful TCP dial
	// means a namespace Olric memberlist is listening there.
	var nsOlricPorts []int
	for port := 10003; port <= 10098; port += 5 {
		conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 1*time.Second)
		if err == nil {
			conn.Close()
			nsOlricPorts = append(nsOlricPorts, port)
		}
	}
	// Only assert reachability when the scan found something; an empty result
	// is treated as "no namespace Olric running locally", not a failure.
	if len(nsOlricPorts) > 0 {
		t.Logf("Found %d namespace Olric memberlist ports: %v", len(nsOlricPorts), nsOlricPorts)
		t.Run("Namespace_Olric_nodes_connected", func(t *testing.T) {
			// Verify all namespace Olric nodes can be reached
			for _, port := range nsOlricPorts {
				conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 2*time.Second)
				require.NoError(t, err, "FAIL: Cannot connect to namespace Olric on port %d", port)
				conn.Close()
				t.Logf(" ✓ Namespace Olric memberlist on port %d is reachable", port)
			}
		})
	}
}
// TestOlric_DataConsistency verifies data remains consistent across operations.
// Covers last-write-wins on repeated updates and key removal via the
// /v1/cache/delete endpoint.
func TestOlric_DataConsistency(t *testing.T) {
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "FAIL: Could not load test environment")
	dmap := fmt.Sprintf("consistency_test_%d", time.Now().UnixNano())
	t.Run("Update_preserves_latest_value", func(t *testing.T) {
		key := fmt.Sprintf("update_key_%d", time.Now().UnixNano())
		// Write multiple times
		for i := 1; i <= 5; i++ {
			value := fmt.Sprintf("version_%d", i)
			err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
			require.NoError(t, err, "FAIL: Could not update key to version %d", i)
		}
		// Final read should return latest version
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not read final value")
		require.Equal(t, "version_5", retrieved, "FAIL: Latest version not preserved")
		t.Logf(" ✓ Latest value preserved after 5 updates")
	})
	t.Run("Delete_removes_key", func(t *testing.T) {
		key := fmt.Sprintf("delete_key_%d", time.Now().UnixNano())
		value := "to_be_deleted"
		// Put
		err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
		require.NoError(t, err, "FAIL: Could not put value")
		// Verify it exists
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not get value before delete")
		require.Equal(t, value, retrieved)
		// Delete (POST with JSON body)
		// The delete endpoint takes the dmap/key pair in the request body,
		// authenticated with the same bearer API key used for put/get.
		deleteBody := map[string]interface{}{
			"dmap": dmap,
			"key":  key,
		}
		deleteBytes, _ := json.Marshal(deleteBody)
		req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/delete", strings.NewReader(string(deleteBytes)))
		req.Header.Set("Content-Type", "application/json")
		req.Header.Set("Authorization", "Bearer "+env.APIKey)
		client := &http.Client{Timeout: 10 * time.Second}
		resp, err := client.Do(req)
		require.NoError(t, err, "FAIL: Delete request failed")
		resp.Body.Close()
		// Both 200 and 204 are accepted as successful deletions.
		require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusNoContent,
			"FAIL: Delete returned unexpected status %d", resp.StatusCode)
		// Verify key is gone
		_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.Error(t, err, "FAIL: Key should not exist after delete")
		require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error")
		t.Logf(" ✓ Delete properly removes key")
	})
}
// TestOlric_TTLExpiration verifies TTL expiration works.
// NOTE: TTL is currently parsed but not applied by the cache handler (TODO in set_handler.go).
// This test is skipped until TTL support is fully implemented.
func TestOlric_TTLExpiration(t *testing.T) {
	// Unconditional skip: remove this line once set_handler.go applies TTLs.
	t.Skip("TTL support not yet implemented in cache handler - see set_handler.go lines 88-98")
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "FAIL: Could not load test environment")
	dmap := fmt.Sprintf("ttl_test_%d", time.Now().UnixNano())
	t.Run("Key_expires_after_TTL", func(t *testing.T) {
		key := fmt.Sprintf("ttl_key_%d", time.Now().UnixNano())
		value := "expires_soon"
		ttlSeconds := 3
		// Put with TTL (TTL is a duration string like "3s", "1m", etc.)
		// Built as a raw HTTP request because the e2e helper has no TTL parameter.
		reqBody := map[string]interface{}{
			"dmap":  dmap,
			"key":   key,
			"value": value,
			"ttl":   fmt.Sprintf("%ds", ttlSeconds),
		}
		bodyBytes, _ := json.Marshal(reqBody)
		req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/put", strings.NewReader(string(bodyBytes)))
		req.Header.Set("Content-Type", "application/json")
		req.Header.Set("Authorization", "Bearer "+env.APIKey)
		client := &http.Client{Timeout: 10 * time.Second}
		resp, err := client.Do(req)
		require.NoError(t, err, "FAIL: Put with TTL failed")
		resp.Body.Close()
		require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated,
			"FAIL: Put returned status %d", resp.StatusCode)
		// Verify key exists immediately
		retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.NoError(t, err, "FAIL: Could not get key immediately after put")
		require.Equal(t, value, retrieved)
		t.Logf(" Key exists immediately after put")
		// Wait for TTL to expire (plus buffer)
		// The 2-second buffer absorbs clock/propagation slack in the cluster.
		time.Sleep(time.Duration(ttlSeconds+2) * time.Second)
		// Key should be gone
		_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
		require.Error(t, err, "FAIL: Key should have expired after %d seconds", ttlSeconds)
		require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error after TTL")
		t.Logf(" ✓ Key expired after %d seconds as expected", ttlSeconds)
	})
}

View File

@ -1,479 +0,0 @@
//go:build e2e
package cluster_test
import (
"context"
"fmt"
"net/http"
"sync"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/require"
)
// =============================================================================
// STRICT RQLITE CLUSTER TESTS
// These tests verify that RQLite cluster operations work correctly.
// Tests FAIL if operations don't work - no skips, no warnings.
// =============================================================================
// TestRQLite_ClusterHealth verifies the RQLite cluster is healthy and operational.
// A successful, well-formed response from the gateway's schema endpoint is the
// proof of reachability.
func TestRQLite_ClusterHealth(t *testing.T) {
	e2e.SkipIfMissingGateway(t)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Hitting the schema endpoint proves the cluster is reachable end-to-end.
	schemaReq := &e2e.HTTPRequest{
		Method: http.MethodGet,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/schema",
	}
	payload, code, err := schemaReq.Do(ctx)
	require.NoError(t, err, "FAIL: Could not reach RQLite cluster")
	require.Equal(t, http.StatusOK, code, "FAIL: RQLite schema endpoint returned %d: %s", code, string(payload))

	// The response must decode and expose a 'tables' field.
	var decoded map[string]interface{}
	require.NoError(t, e2e.DecodeJSON(payload, &decoded), "FAIL: Could not decode RQLite schema response")
	_, hasTables := decoded["tables"]
	require.True(t, hasTables, "FAIL: RQLite schema response missing 'tables' field")

	t.Logf(" ✓ RQLite cluster is healthy and responding")
}
// TestRQLite_WriteReadConsistency verifies data written can be read back consistently.
// Creates a throwaway table (dropped on exit), then checks a single
// write/read round-trip and a 10-row batch insert.
//
// Fix: in the batch subtest, the DecodeJSON error was ignored and the row-count
// assertion was wrapped in `if rows, ok := ...; ok && len(rows) > 0`, so a
// malformed or empty response silently PASSED the subtest. The count check is
// now mandatory, with checked type assertions instead of panicking ones.
func TestRQLite_WriteReadConsistency(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop; errors ignored because the test may fail
	// before the table exists.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create table
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT, created_at DATETIME DEFAULT CURRENT_TIMESTAMP)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table request failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Logf("Created table %s", table)
	t.Run("Write_then_read_returns_same_data", func(t *testing.T) {
		uniqueValue := fmt.Sprintf("test_value_%d", time.Now().UnixNano())
		// Insert
		insertReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
				},
			},
		}
		_, status, err := insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Insert request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
		// Read back
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Query request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
		var queryResp map[string]interface{}
		err = e2e.DecodeJSON(body, &queryResp)
		require.NoError(t, err, "FAIL: Could not decode query response")
		// Verify we got our value back
		count, ok := queryResp["count"].(float64)
		require.True(t, ok, "FAIL: Response missing 'count' field")
		require.Equal(t, float64(1), count, "FAIL: Expected 1 row, got %v", count)
		t.Logf(" ✓ Written value '%s' was read back correctly", uniqueValue)
	})
	t.Run("Multiple_writes_all_readable", func(t *testing.T) {
		// Insert multiple values
		var statements []string
		for i := 0; i < 10; i++ {
			statements = append(statements,
				fmt.Sprintf("INSERT INTO %s (value) VALUES ('batch_%d')", table, i))
		}
		insertReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": statements,
			},
		}
		_, status, err := insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Batch insert failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Batch insert returned status %d", status)
		// Count all batch rows
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) as cnt FROM %s WHERE value LIKE 'batch_%%'", table),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		// The count check is mandatory: a response without rows is a failure,
		// not a silent pass.
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' array")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row type %T", rows[0])
		require.NotEmpty(t, row, "FAIL: Count row is empty")
		countF, ok := row[0].(float64)
		require.True(t, ok, "FAIL: Unexpected count type %T", row[0])
		count := int(countF)
		require.Equal(t, 10, count, "FAIL: Expected 10 batch rows, got %d", count)
		t.Logf(" ✓ All 10 batch writes are readable")
	})
}
// TestRQLite_TransactionAtomicity verifies transactions are atomic.
// Creates a throwaway table (dropped on exit), commits a 3-statement
// transaction, then checks an UPDATE is visible.
//
// Fix: verification queries used `body, _, _ := queryReq.Do(ctx)` and ignored
// the DecodeJSON error, and the 3-row atomicity check was wrapped in
// `if rows, ok := ...` so a malformed response silently PASSED. Errors are now
// checked and the count assertion is mandatory with checked type assertions.
func TestRQLite_TransactionAtomicity(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop; errors intentionally ignored.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create table
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT UNIQUE)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Run("Successful_transaction_commits_all", func(t *testing.T) {
		txReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_1')", table),
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_2')", table),
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_3')", table),
				},
			},
		}
		_, status, err := txReq.Do(ctx)
		require.NoError(t, err, "FAIL: Transaction request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Transaction returned status %d", status)
		// Verify all 3 rows exist
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s WHERE value LIKE 'tx_val_%%'", table),
			},
		}
		body, qStatus, qErr := queryReq.Do(ctx)
		require.NoError(t, qErr, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, qStatus, "FAIL: Count query returned status %d", qStatus)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		// The count assertion is mandatory; malformed responses fail the test.
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' array")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row type %T", rows[0])
		require.NotEmpty(t, row, "FAIL: Count row is empty")
		countF, ok := row[0].(float64)
		require.True(t, ok, "FAIL: Unexpected count type %T", row[0])
		count := int(countF)
		require.Equal(t, 3, count, "FAIL: Transaction didn't commit all 3 rows - got %d", count)
		t.Logf(" ✓ Transaction committed all 3 rows atomically")
	})
	t.Run("Updates_preserve_consistency", func(t *testing.T) {
		// Update a value
		updateReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("UPDATE %s SET value = 'tx_val_1_updated' WHERE value = 'tx_val_1'", table),
				},
			},
		}
		_, status, err := updateReq.Do(ctx)
		require.NoError(t, err, "FAIL: Update request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Update returned status %d", status)
		// Verify update took effect
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = 'tx_val_1_updated'", table),
			},
		}
		body, qStatus, qErr := queryReq.Do(ctx)
		require.NoError(t, qErr, "FAIL: Verification query failed")
		require.Equal(t, http.StatusOK, qStatus, "FAIL: Verification query returned status %d", qStatus)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode verification response")
		count, ok := queryResp["count"].(float64)
		require.True(t, ok, "FAIL: Response missing 'count' field")
		require.Equal(t, float64(1), count, "FAIL: Update didn't take effect")
		t.Logf(" ✓ Update preserved consistency")
	})
}
// TestRQLite_ConcurrentWrites verifies the cluster handles concurrent writes correctly.
// Five workers each perform ten single-row insert transactions; afterwards the
// total row count must equal 50.
//
// Fix: the final count verification used `body, _, _ := queryReq.Do(ctx)`,
// ignored the DecodeJSON error, and wrapped the assertion in
// `if rows, ok := ...` — so the one check the test exists for could silently
// pass on a malformed response. The verification is now mandatory.
func TestRQLite_ConcurrentWrites(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop; errors intentionally ignored.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create table
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, worker INTEGER, seq INTEGER)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Run("Concurrent_inserts_all_succeed", func(t *testing.T) {
		numWorkers := 5
		insertsPerWorker := 10
		expectedTotal := numWorkers * insertsPerWorker
		var wg sync.WaitGroup
		// Buffered for the worst case; each worker stops at its first error.
		errChan := make(chan error, numWorkers*insertsPerWorker)
		for w := 0; w < numWorkers; w++ {
			wg.Add(1)
			go func(workerID int) {
				defer wg.Done()
				for i := 0; i < insertsPerWorker; i++ {
					insertReq := &e2e.HTTPRequest{
						Method: http.MethodPost,
						URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
						Body: map[string]interface{}{
							"statements": []string{
								fmt.Sprintf("INSERT INTO %s (worker, seq) VALUES (%d, %d)", table, workerID, i),
							},
						},
					}
					_, status, err := insertReq.Do(ctx)
					if err != nil {
						errChan <- fmt.Errorf("worker %d insert %d failed: %w", workerID, i, err)
						return
					}
					if status != http.StatusOK {
						errChan <- fmt.Errorf("worker %d insert %d got status %d", workerID, i, status)
						return
					}
				}
			}(w)
		}
		wg.Wait()
		close(errChan)
		// Collect errors
		var errors []error
		for err := range errChan {
			errors = append(errors, err)
		}
		require.Empty(t, errors, "FAIL: %d concurrent inserts failed: %v", len(errors), errors)
		// Verify total count — this check is mandatory; a malformed or empty
		// response fails the test instead of silently passing.
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s", table),
			},
		}
		body, qStatus, qErr := queryReq.Do(ctx)
		require.NoError(t, qErr, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, qStatus, "FAIL: Count query returned status %d", qStatus)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' array")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row type %T", rows[0])
		require.NotEmpty(t, row, "FAIL: Count row is empty")
		countF, ok := row[0].(float64)
		require.True(t, ok, "FAIL: Unexpected count type %T", row[0])
		count := int(countF)
		require.Equal(t, expectedTotal, count,
			"FAIL: Expected %d total rows from concurrent inserts, got %d", expectedTotal, count)
		t.Logf(" ✓ All %d concurrent inserts succeeded", expectedTotal)
	})
}
// TestRQLite_NamespaceClusterOperations verifies RQLite works in namespace clusters.
// Provisions a fresh namespace and runs create-table / insert / query against
// that namespace's cluster, authenticating every request with the namespace
// API key.
func TestRQLite_NamespaceClusterOperations(t *testing.T) {
	// Create a new namespace
	namespace := fmt.Sprintf("rqlite-test-%d", time.Now().UnixNano())
	env, err := e2e.LoadTestEnvWithNamespace(namespace)
	require.NoError(t, err, "FAIL: Could not create namespace for RQLite test")
	require.NotEmpty(t, env.APIKey, "FAIL: No API key - namespace provisioning failed")
	t.Logf("Created namespace %s", namespace)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop of the test table; errors ignored because the
	// test may fail before the table exists.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/drop-table",
			Body:    map[string]interface{}{"table": table},
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
		}
		dropReq.Do(context.Background())
	}()
	t.Run("Namespace_RQLite_create_insert_query", func(t *testing.T) {
		// Create table in namespace cluster
		createReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/create-table",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"schema": fmt.Sprintf(
					"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT)",
					table,
				),
			},
		}
		_, status, err := createReq.Do(ctx)
		require.NoError(t, err, "FAIL: Create table in namespace failed")
		require.True(t, status == http.StatusCreated || status == http.StatusOK,
			"FAIL: Create table returned status %d", status)
		// Insert data
		// Timestamped value so the later lookup is unambiguous across runs.
		uniqueValue := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
		insertReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/transaction",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
				},
			},
		}
		_, status, err = insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Insert in namespace failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
		// Query data
		queryReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/query",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Query in namespace failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
		var queryResp map[string]interface{}
		e2e.DecodeJSON(body, &queryResp)
		// A decode failure or missing 'count' leaves count at 0, which the
		// equality check below rejects — so this cannot silently pass.
		count, _ := queryResp["count"].(float64)
		require.Equal(t, float64(1), count, "FAIL: Data not found in namespace cluster")
		t.Logf(" ✓ Namespace RQLite operations work correctly")
	})
}

View File

@ -478,11 +478,6 @@ func GetAPIKey() string {
return apiKey
}
// GetJWT returns the gateway JWT token (currently not auto-discovered)
func GetJWT() string {
return ""
}
// GetBootstrapPeers returns bootstrap peer addresses from config
func GetBootstrapPeers() []string {
cacheMutex.RLock()
@ -748,10 +743,6 @@ func NewNetworkClient(t *testing.T) client.NetworkClient {
cfg.APIKey = GetAPIKey()
cfg.QuietMode = true // Suppress debug logs in tests
if jwt := GetJWT(); jwt != "" {
cfg.JWT = jwt
}
if peers := GetBootstrapPeers(); len(peers) > 0 {
cfg.BootstrapPeers = peers
}

View File

@ -1,333 +0,0 @@
//go:build e2e && production
package production
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDNS_MultipleARecords verifies that deploying with replicas creates
// multiple A records (one per node) for DNS round-robin.
// Requires a multi-server environment; skipped locally and with fewer than
// two configured servers.
func TestDNS_MultipleARecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	if len(env.Config.Servers) < 2 {
		t.Skip("Requires at least 2 servers")
	}
	deploymentName := fmt.Sprintf("dns-multi-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	// Tear down the deployment unless the environment asks to keep artifacts.
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, deploymentID)
		}
	}()
	// Wait for replica setup and DNS propagation
	time.Sleep(15 * time.Second)
	t.Run("DNS returns multiple IPs", func(t *testing.T) {
		deployment := e2e.GetDeployment(t, env, deploymentID)
		subdomain, _ := deployment["subdomain"].(string)
		// Fall back to the deployment name when the API omits the subdomain.
		if subdomain == "" {
			subdomain = deploymentName
		}
		fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
		// Query nameserver directly
		// A custom resolver dialing the first server's UDP/53 bypasses local
		// DNS caches so fresh records are observed.
		nameserverIP := env.Config.Servers[0].IP
		resolver := &net.Resolver{
			PreferGo: true,
			Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
				d := net.Dialer{Timeout: 10 * time.Second}
				return d.Dial("udp", nameserverIP+":53")
			},
		}
		ctx := context.Background()
		ips, err := resolver.LookupHost(ctx, fqdn)
		if err != nil {
			// Fallback: try the system resolver before giving up.
			t.Logf("DNS lookup failed for %s: %v", fqdn, err)
			t.Log("Trying net.LookupHost instead...")
			ips, err = net.LookupHost(fqdn)
		}
		if err != nil {
			// Propagation delays are expected; skip rather than fail.
			t.Logf("DNS lookup failed: %v (DNS may not be propagated yet)", err)
			t.Skip("DNS not yet propagated")
		}
		t.Logf("DNS returned %d IPs for %s: %v", len(ips), fqdn, ips)
		assert.GreaterOrEqual(t, len(ips), 2,
			"Should have at least 2 A records (home + replica)")
		// Verify returned IPs are from our server list
		serverIPs := e2e.GetServerIPs(env.Config)
		for _, ip := range ips {
			assert.Contains(t, serverIPs, ip,
				"DNS IP %s should be one of our servers", ip)
		}
	})
}
// TestDNS_CleanupOnDelete verifies that deleting a deployment removes all
// DNS records (both home and replica A records).
// The post-delete check is advisory: stale cached records are logged as a
// warning rather than failing the test.
func TestDNS_CleanupOnDelete(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	deploymentName := fmt.Sprintf("dns-cleanup-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	// Wait for DNS
	time.Sleep(10 * time.Second)
	// Get subdomain before deletion
	// (the record name is needed after the deployment object is gone).
	deployment := e2e.GetDeployment(t, env, deploymentID)
	subdomain, _ := deployment["subdomain"].(string)
	if subdomain == "" {
		subdomain = deploymentName
	}
	fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
	// Verify DNS works before deletion
	t.Run("DNS resolves before deletion", func(t *testing.T) {
		nodeURL := extractNodeURLProd(t, deployment)
		if nodeURL == "" {
			t.Skip("No URL to test")
		}
		// Hit the gateway with the deployment's Host header; the status code
		// is only logged here — this subtest is a sanity check, not an assertion.
		domain := extractDomainProd(nodeURL)
		req, _ := http.NewRequest("GET", env.GatewayURL+"/", nil)
		req.Host = domain
		resp, err := env.HTTPClient.Do(req)
		if err == nil {
			resp.Body.Close()
			t.Logf("Pre-delete: status=%d", resp.StatusCode)
		}
	})
	// Delete
	e2e.DeleteDeployment(t, env, deploymentID)
	time.Sleep(10 * time.Second)
	t.Run("DNS records removed after deletion", func(t *testing.T) {
		ips, err := net.LookupHost(fqdn)
		if err != nil {
			t.Logf("DNS lookup failed (expected): %v", err)
			return // Good — no records
		}
		// If we still get IPs, they might be cached. Log and warn.
		if len(ips) > 0 {
			t.Logf("WARNING: DNS still returns %d IPs after deletion (may be cached): %v", len(ips), ips)
		}
	})
}
// TestDNS_CustomSubdomain verifies that deploying with a custom subdomain
// creates DNS records using the custom name.
func TestDNS_CustomSubdomain(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)

	name := fmt.Sprintf("dns-custom-%d", time.Now().Unix())
	tarball := filepath.Join("../../testdata/apps/react-app")
	id := createDeploymentWithSubdomain(t, env, name, tarball)
	require.NotEmpty(t, id)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, id)
		}
	}()

	// Allow the control plane to record the deployment and publish DNS.
	time.Sleep(10 * time.Second)

	t.Run("Deployment has subdomain with random suffix", func(t *testing.T) {
		dep := e2e.GetDeployment(t, env, id)
		sub, _ := dep["subdomain"].(string)
		require.NotEmpty(t, sub, "Deployment should have a subdomain")
		t.Logf("Subdomain: %s", sub)
		// Verify the subdomain starts with the deployment name
		assert.Contains(t, sub, name[:10],
			"Subdomain should relate to deployment name")
	})
}
// TestDNS_RedeployPreservesSubdomain verifies that updating a deployment
// does not change the subdomain/DNS.
func TestDNS_RedeployPreservesSubdomain(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)

	name := fmt.Sprintf("dns-preserve-%d", time.Now().Unix())
	tarball := filepath.Join("../../testdata/apps/react-app")
	id := e2e.CreateTestDeployment(t, env, name, tarball)
	require.NotEmpty(t, id)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, id)
		}
	}()

	time.Sleep(5 * time.Second)

	// Record the subdomain as assigned on first deploy.
	before := e2e.GetDeployment(t, env, id)
	subBefore, _ := before["subdomain"].(string)
	urlsBefore := before["urls"]
	t.Logf("Original subdomain: %s, urls: %v", subBefore, urlsBefore)

	// Push an update and let it settle.
	updateStaticDeploymentProd(t, env, name, tarball)
	time.Sleep(5 * time.Second)

	t.Run("Subdomain unchanged after update", func(t *testing.T) {
		after := e2e.GetDeployment(t, env, id)
		subAfter, _ := after["subdomain"].(string)
		assert.Equal(t, subBefore, subAfter,
			"Subdomain should not change after update")
		t.Logf("After update: subdomain=%s", subAfter)
	})
}
// createDeploymentWithSubdomain uploads a static deployment tarball (building
// one on the fly with tar when tarballPath is a directory) and returns the new
// deployment's ID. It fails the test on any filesystem, transport, or API
// error, and on a response that carries no recognizable ID field.
func createDeploymentWithSubdomain(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) string {
	t.Helper()
	var fileData []byte
	info, err := os.Stat(tarballPath)
	require.NoError(t, err)
	if info.IsDir() {
		// Package the directory into a gzipped tarball streamed to stdout.
		fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
		require.NoError(t, err)
	} else {
		file, err := os.Open(tarballPath)
		require.NoError(t, err)
		defer file.Close()
		// Was silently ignored before; a short read would corrupt the upload.
		fileData, err = io.ReadAll(file)
		require.NoError(t, err)
	}
	// Hand-build the multipart body so the boundary is deterministic.
	body := &bytes.Buffer{}
	boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
	body.WriteString(name + "\r\n")
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
	body.WriteString("Content-Type: application/gzip\r\n\r\n")
	body.Write(fileData)
	body.WriteString("\r\n--" + boundary + "--\r\n")
	req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/upload", body)
	require.NoError(t, err)
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	req.Header.Set("Authorization", "Bearer "+env.APIKey)
	resp, err := env.HTTPClient.Do(req)
	require.NoError(t, err)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusCreated {
		bodyBytes, _ := io.ReadAll(resp.Body)
		t.Fatalf("Upload failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
	}
	var result map[string]interface{}
	// Fail loudly on malformed JSON instead of returning from an empty map.
	require.NoError(t, json.NewDecoder(resp.Body).Decode(&result))
	if id, ok := result["deployment_id"].(string); ok {
		return id
	}
	if id, ok := result["id"].(string); ok {
		return id
	}
	t.Fatalf("No id in response: %+v", result)
	return ""
}
// updateStaticDeploymentProd pushes a new tarball for an existing static
// deployment identified by name. When tarballPath is a directory it is
// packaged with tar on the fly. Fails the test on any error or non-200 status.
func updateStaticDeploymentProd(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) {
	t.Helper()
	var fileData []byte
	info, err := os.Stat(tarballPath)
	require.NoError(t, err)
	if info.IsDir() {
		// Package the directory into a gzipped tarball streamed to stdout.
		fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
		require.NoError(t, err)
	} else {
		file, err := os.Open(tarballPath)
		require.NoError(t, err)
		defer file.Close()
		// Was silently ignored before; a short read would corrupt the upload.
		fileData, err = io.ReadAll(file)
		require.NoError(t, err)
	}
	// Hand-build the multipart body so the boundary is deterministic.
	body := &bytes.Buffer{}
	boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
	body.WriteString(name + "\r\n")
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
	body.WriteString("Content-Type: application/gzip\r\n\r\n")
	body.Write(fileData)
	body.WriteString("\r\n--" + boundary + "--\r\n")
	req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/update", body)
	require.NoError(t, err)
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	req.Header.Set("Authorization", "Bearer "+env.APIKey)
	resp, err := env.HTTPClient.Do(req)
	require.NoError(t, err)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		bodyBytes, _ := io.ReadAll(resp.Body)
		t.Fatalf("Update failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
	}
}

View File

@ -1,121 +0,0 @@
//go:build e2e && production
package production
import (
"context"
"fmt"
"net"
"path/filepath"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDNS_DeploymentResolution tests that deployed applications are resolvable via DNS
// This test requires production mode as it performs real DNS lookups
func TestDNS_DeploymentResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")

	name := fmt.Sprintf("dns-test-%d", time.Now().Unix())
	tarball := filepath.Join("../../testdata/apps/react-app")
	id := e2e.CreateTestDeployment(t, env, name, tarball)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, id)
		}
	}()

	// Wait for DNS propagation
	domain := env.BuildDeploymentDomain(name)
	t.Logf("Testing DNS resolution for: %s", domain)

	t.Run("DNS resolves to valid server IP", func(t *testing.T) {
		// Allow some time for DNS propagation
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()

		// Poll every 2s until the name resolves or the deadline expires.
		var ips []string
		for {
			if ctx.Err() != nil {
				t.Fatalf("DNS resolution timeout for %s", domain)
			}
			var lookupErr error
			ips, lookupErr = net.LookupHost(domain)
			if lookupErr == nil && len(ips) > 0 {
				break
			}
			time.Sleep(2 * time.Second)
		}

		t.Logf("DNS resolved: %s -> %v", domain, ips)
		assert.NotEmpty(t, ips, "Should have IP addresses")

		// Verify resolved IP is one of our servers
		validIPs := e2e.GetServerIPs(env.Config)
		if len(validIPs) > 0 {
			found := false
			for _, ip := range ips {
				for _, valid := range validIPs {
					if ip == valid {
						found = true
						break
					}
				}
			}
			assert.True(t, found, "Resolved IP should be one of our servers: %v (valid: %v)", ips, validIPs)
		}
	})
}
// TestDNS_BaseDomainResolution tests that the base domain resolves correctly
func TestDNS_BaseDomainResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")

	t.Run("Base domain resolves", func(t *testing.T) {
		addrs, lookupErr := net.LookupHost(env.BaseDomain)
		require.NoError(t, lookupErr, "Base domain %s should resolve", env.BaseDomain)
		assert.NotEmpty(t, addrs, "Should have IP addresses")
		t.Logf("✓ Base domain %s resolves to: %v", env.BaseDomain, addrs)
	})
}
// TestDNS_WildcardResolution tests wildcard DNS for arbitrary subdomains
func TestDNS_WildcardResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")

	t.Run("Wildcard subdomain resolves", func(t *testing.T) {
		// Test with a random subdomain that doesn't exist as a deployment
		host := fmt.Sprintf("random-test-%d.%s", time.Now().UnixNano(), env.BaseDomain)
		addrs, lookupErr := net.LookupHost(host)
		if lookupErr != nil {
			// DNS may not support wildcard - that's OK for some setups
			t.Logf("⚠ Wildcard DNS not configured (this may be expected): %v", lookupErr)
			t.Skip("Wildcard DNS not configured")
			return
		}
		assert.NotEmpty(t, addrs, "Wildcard subdomain should resolve")
		t.Logf("✓ Wildcard subdomain resolves: %s -> %v", host, addrs)
	})
}

View File

@ -1,181 +0,0 @@
//go:build e2e && production
package production
import (
"context"
"net"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestNameserver_NSRecords tests that NS records are properly configured for
// the domain. Each nameserver listed in e2e/config.yaml must appear among the
// NS records published for the base domain.
func TestNameserver_NSRecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	if len(env.Config.Nameservers) == 0 {
		t.Skip("No nameservers configured in e2e/config.yaml")
	}
	t.Run("NS records exist for base domain", func(t *testing.T) {
		nsRecords, err := net.LookupNS(env.BaseDomain)
		require.NoError(t, err, "Should be able to look up NS records for %s", env.BaseDomain)
		require.NotEmpty(t, nsRecords, "Should have NS records")
		t.Logf("Found %d NS records for %s:", len(nsRecords), env.BaseDomain)
		for _, ns := range nsRecords {
			t.Logf("  - %s", ns.Host)
		}
		// Verify our nameservers are listed
		for _, expected := range env.Config.Nameservers {
			// Normalize both sides: trim the trailing root dot and compare
			// case-insensitively (DNS names are case-insensitive). The old
			// `nsHost == expected+"."` branch was dead code — the dot had
			// already been trimmed from nsHost — and a configured name with
			// a trailing dot could never match.
			want := strings.TrimSuffix(expected, ".")
			found := false
			for _, ns := range nsRecords {
				nsHost := strings.TrimSuffix(ns.Host, ".")
				if strings.EqualFold(nsHost, want) {
					found = true
					break
				}
			}
			assert.True(t, found, "NS records should include %s", expected)
		}
	})
}
// TestNameserver_GlueRecords tests that glue records point to correct IPs
func TestNameserver_GlueRecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	if len(env.Config.Nameservers) == 0 {
		t.Skip("No nameservers configured in e2e/config.yaml")
	}
	nsServers := e2e.GetNameserverServers(env.Config)
	if len(nsServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("Glue records resolve to correct IPs", func(t *testing.T) {
		for idx, ns := range env.Config.Nameservers {
			ips, err := net.LookupHost(ns)
			require.NoError(t, err, "Nameserver %s should resolve", ns)
			require.NotEmpty(t, ips, "Nameserver %s should have IP addresses", ns)
			t.Logf("Nameserver %s resolves to: %v", ns, ips)
			// If we have the expected IP, verify it matches
			// (positional pairing: Nameservers[i] <-> nsServers[i])
			if idx >= len(nsServers) {
				continue
			}
			want := nsServers[idx].IP
			matched := false
			for _, ip := range ips {
				if ip == want {
					matched = true
					break
				}
			}
			assert.True(t, matched, "Glue record for %s should point to %s (got %v)", ns, want, ips)
		}
	})
}
// TestNameserver_CoreDNSResponds tests that our CoreDNS servers respond to queries
func TestNameserver_CoreDNSResponds(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	nsServers := e2e.GetNameserverServers(env.Config)
	if len(nsServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("CoreDNS servers respond to queries", func(t *testing.T) {
		for _, server := range nsServers {
			t.Run(server.Name, func(t *testing.T) {
				// Resolver pinned to this specific server's port 53 over UDP.
				r := &net.Resolver{
					PreferGo: true,
					Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
						d := net.Dialer{
							Timeout: 5 * time.Second,
						}
						return d.DialContext(ctx, "udp", server.IP+":53")
					},
				}
				ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
				defer cancel()
				// Query the base domain
				ips, lookupErr := r.LookupHost(ctx, env.BaseDomain)
				if lookupErr != nil {
					// Log the error but don't fail - server might be configured differently
					t.Logf("⚠ CoreDNS at %s (%s) query error: %v", server.Name, server.IP, lookupErr)
					return
				}
				t.Logf("✓ CoreDNS at %s (%s) responded: %s -> %v", server.Name, server.IP, env.BaseDomain, ips)
				assert.NotEmpty(t, ips, "CoreDNS should return IP addresses")
			})
		}
	})
}
// TestNameserver_QueryLatency tests DNS query latency from our nameservers.
// Each configured nameserver is queried directly for the base domain and the
// round-trip must stay under 500ms; unreachable servers are logged, not fatal.
func TestNameserver_QueryLatency(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	nameserverServers := e2e.GetNameserverServers(env.Config)
	if len(nameserverServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("DNS query latency is acceptable", func(t *testing.T) {
		for _, server := range nameserverServers {
			// Resolver that bypasses the system DNS and queries this server directly.
			resolver := &net.Resolver{
				PreferGo: true,
				Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
					d := net.Dialer{
						Timeout: 5 * time.Second,
					}
					return d.DialContext(ctx, "udp", server.IP+":53")
				},
			}
			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			start := time.Now()
			_, err := resolver.LookupHost(ctx, env.BaseDomain)
			latency := time.Since(start)
			// Release the context immediately. The previous `defer cancel()`
			// inside the loop accumulated until the subtest returned, leaking
			// one timer per nameserver.
			cancel()
			if err != nil {
				t.Logf("⚠ Query to %s failed: %v", server.Name, err)
				continue
			}
			t.Logf("DNS latency from %s (%s): %v", server.Name, server.IP, latency)
			// DNS queries should be fast (under 500ms is reasonable)
			assert.Less(t, latency, 500*time.Millisecond,
				"DNS query to %s should complete in under 500ms", server.Name)
		}
	})
}

BIN
inspector Executable file

Binary file not shown.

158
pkg/cli/inspect_command.go Normal file
View File

@ -0,0 +1,158 @@
package cli
import (
"bufio"
"context"
"flag"
"fmt"
"os"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
// Import checks package so init() registers the checkers
_ "github.com/DeBrosOfficial/network/pkg/inspector/checks"
)
// loadDotEnv loads key=value pairs from a .env file into os environment.
// Only sets vars that are not already set (env takes precedence over file).
func loadDotEnv(path string) {
f, err := os.Open(path)
if err != nil {
return // .env is optional
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
eq := strings.IndexByte(line, '=')
if eq < 1 {
continue
}
key := line[:eq]
value := line[eq+1:]
// Only set if not already in environment
if os.Getenv(key) == "" {
os.Setenv(key, value)
}
}
}
// HandleInspectCommand handles the "orama inspect" command.
//
// It parses the inspect flags, loads the node inventory for the requested
// environment, then runs a four-phase pipeline: collect data from the nodes,
// run the registered checks, print a report (table or JSON), and optionally
// send failures/warnings to an AI model for analysis. The process exits with
// status 1 on flag/config errors or when any check fails.
func HandleInspectCommand(args []string) {
	// Load .env file from current directory (only sets unset vars)
	loadDotEnv(".env")
	fs := flag.NewFlagSet("inspect", flag.ExitOnError)
	configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf")
	env := fs.String("env", "", "Environment to inspect (devnet, testnet)")
	subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,all)")
	format := fs.String("format", "table", "Output format (table, json)")
	timeout := fs.Duration("timeout", 30*time.Second, "SSH command timeout")
	verbose := fs.Bool("verbose", false, "Verbose output")
	// AI flags
	aiEnabled := fs.Bool("ai", false, "Enable AI analysis of failures")
	aiModel := fs.String("model", "moonshotai/kimi-k2.5", "OpenRouter model for AI analysis")
	aiAPIKey := fs.String("api-key", "", "OpenRouter API key (or OPENROUTER_API_KEY env)")
	fs.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: orama inspect [flags]\n\n")
		fmt.Fprintf(os.Stderr, "Inspect cluster health by SSHing into nodes and running checks.\n\n")
		fmt.Fprintf(os.Stderr, "Flags:\n")
		fs.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --subsystem rqlite\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --ai\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --ai --model openai/gpt-4o\n")
	}
	if err := fs.Parse(args); err != nil {
		// Defensive: flag.ExitOnError already exits on a parse failure,
		// so this branch is normally unreachable.
		os.Exit(1)
	}
	if *env == "" {
		fmt.Fprintf(os.Stderr, "Error: --env is required (devnet, testnet)\n")
		os.Exit(1)
	}
	// Load nodes
	nodes, err := inspector.LoadNodes(*configPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error loading config: %v\n", err)
		os.Exit(1)
	}
	// Filter by environment
	nodes = inspector.FilterByEnv(nodes, *env)
	if len(nodes) == 0 {
		fmt.Fprintf(os.Stderr, "Error: no nodes found for environment %q\n", *env)
		os.Exit(1)
	}
	// Parse subsystems ("all" is represented as a nil slice downstream)
	var subsystems []string
	if *subsystem != "all" {
		subsystems = strings.Split(*subsystem, ",")
	}
	fmt.Printf("Inspecting %d %s nodes", len(nodes), *env)
	if len(subsystems) > 0 {
		fmt.Printf(" [%s]", strings.Join(subsystems, ","))
	}
	if *aiEnabled {
		fmt.Printf(" (AI: %s)", *aiModel)
	}
	fmt.Printf("...\n\n")
	// Phase 1: Collect — overall deadline is the per-command SSH timeout
	// plus 10s of headroom for connection setup across nodes.
	ctx, cancel := context.WithTimeout(context.Background(), *timeout+10*time.Second)
	defer cancel()
	if *verbose {
		fmt.Printf("Collecting data from %d nodes (timeout: %s)...\n", len(nodes), timeout)
	}
	data := inspector.Collect(ctx, nodes, subsystems, *verbose)
	if *verbose {
		fmt.Printf("Collection complete in %.1fs\n\n", data.Duration.Seconds())
	}
	// Phase 2: Check
	results := inspector.RunChecks(data, subsystems)
	// Phase 3: Report
	switch *format {
	case "json":
		inspector.PrintJSON(results, os.Stdout)
	default:
		inspector.PrintTable(results, os.Stdout)
	}
	// Phase 4: AI Analysis (if enabled and there are failures or warnings)
	if *aiEnabled {
		issues := results.FailuresAndWarnings()
		if len(issues) == 0 {
			fmt.Printf("\nAll checks passed — no AI analysis needed.\n")
		} else {
			fmt.Printf("\nAnalyzing %d issues with %s...\n", len(issues), *aiModel)
			analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey)
			if err != nil {
				// AI analysis is advisory; a failure here does not change the exit code.
				fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err)
			} else {
				inspector.PrintAnalysis(analysis, os.Stdout)
			}
		}
	}
	// Exit with non-zero if any failures
	if failures := results.Failures(); len(failures) > 0 {
		os.Exit(1)
	}
}

View File

@ -53,13 +53,17 @@ func HandleStop() {
// Reset failed state for any services that might be in failed state
resetArgs := []string{"reset-failed"}
resetArgs = append(resetArgs, services...)
exec.Command("systemctl", resetArgs...).Run()
if err := exec.Command("systemctl", resetArgs...).Run(); err != nil {
fmt.Printf(" ⚠️ Warning: Failed to reset-failed state: %v\n", err)
}
// Wait again after reset-failed
time.Sleep(1 * time.Second)
// Stop again to ensure they're stopped
exec.Command("systemctl", stopArgs...).Run()
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
fmt.Printf(" ⚠️ Warning: Second stop attempt had errors: %v\n", err)
}
time.Sleep(1 * time.Second)
hadError := false

View File

@ -60,10 +60,6 @@ func ParseFlags(args []string) (*Flags, error) {
fs.IntVar(&flags.AnyoneBandwidth, "anyone-bandwidth", 30, "Limit relay to N% of VPS bandwidth (0=unlimited, runs speedtest)")
fs.IntVar(&flags.AnyoneAccounting, "anyone-accounting", 0, "Monthly data cap for relay in GB (0=unlimited)")
// Support legacy flags for backwards compatibility
nightly := fs.Bool("nightly", false, "Use nightly branch (deprecated, use --branch nightly)")
main := fs.Bool("main", false, "Use main branch (deprecated, use --branch main)")
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return nil, err
@ -71,14 +67,6 @@ func ParseFlags(args []string) (*Flags, error) {
return nil, fmt.Errorf("failed to parse flags: %w", err)
}
// Handle legacy flags
if *nightly {
flags.Branch = "nightly"
}
if *main {
flags.Branch = "main"
}
// Set nameserver if explicitly provided
if *nameserver {
flags.Nameserver = nameserver

View File

@ -10,6 +10,8 @@ import (
"strings"
"syscall"
"time"
"github.com/DeBrosOfficial/network/pkg/constants"
)
var ErrServiceNotFound = errors.New("service not found")
@ -22,15 +24,15 @@ type PortSpec struct {
var ServicePorts = map[string][]PortSpec{
"debros-gateway": {
{Name: "Gateway API", Port: 6001},
{Name: "Gateway API", Port: constants.GatewayAPIPort},
},
"debros-olric": {
{Name: "Olric HTTP", Port: 3320},
{Name: "Olric Memberlist", Port: 3322},
{Name: "Olric HTTP", Port: constants.OlricHTTPPort},
{Name: "Olric Memberlist", Port: constants.OlricMemberlistPort},
},
"debros-node": {
{Name: "RQLite HTTP", Port: 5001},
{Name: "RQLite Raft", Port: 7001},
{Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort},
{Name: "RQLite Raft", Port: constants.RQLiteRaftPort},
},
"debros-ipfs": {
{Name: "IPFS API", Port: 4501},
@ -48,12 +50,12 @@ func DefaultPorts() []PortSpec {
{Name: "IPFS Swarm", Port: 4001},
{Name: "IPFS API", Port: 4501},
{Name: "IPFS Gateway", Port: 8080},
{Name: "Gateway API", Port: 6001},
{Name: "RQLite HTTP", Port: 5001},
{Name: "RQLite Raft", Port: 7001},
{Name: "Gateway API", Port: constants.GatewayAPIPort},
{Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort},
{Name: "RQLite Raft", Port: constants.RQLiteRaftPort},
{Name: "IPFS Cluster API", Port: 9094},
{Name: "Olric HTTP", Port: 3320},
{Name: "Olric Memberlist", Port: 3322},
{Name: "Olric HTTP", Port: constants.OlricHTTPPort},
{Name: "Olric Memberlist", Port: constants.OlricMemberlistPort},
}
}

View File

@ -0,0 +1,9 @@
package constants
// Node capacity limits used by both deployment and namespace scheduling.
const (
	MaxDeploymentsPerNode = 100  // upper bound on deployments scheduled onto a single node
	MaxMemoryMB           = 8192 // 8GB
	MaxCPUPercent         = 400  // 400% = 4 cores
	MaxPortsPerNode       = 9900 // ~10k ports available
)

11
pkg/constants/ports.go Normal file
View File

@ -0,0 +1,11 @@
package constants
// Service ports used across the network.
const (
	WireGuardPort       = 51820 // WireGuard tunnel listen port
	RQLiteHTTPPort      = 5001  // RQLite HTTP API
	RQLiteRaftPort      = 7001  // RQLite Raft transport
	OlricHTTPPort       = 3320  // Olric HTTP endpoint
	OlricMemberlistPort = 3322  // Olric memberlist (cluster gossip) port
	GatewayAPIPort      = 6001  // Gateway API
)

View File

@ -6,6 +6,7 @@ import (
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/constants"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
@ -270,7 +271,7 @@ func (hnm *HomeNodeManager) getNodeCapacity(ctx context.Context, nodeID string)
AllocatedPorts: allocatedPorts,
AvailablePorts: availablePorts,
UsedMemoryMB: totalMemoryMB,
AvailableMemoryMB: 8192 - totalMemoryMB, // Assume 8GB per node (make configurable later)
AvailableMemoryMB: constants.MaxMemoryMB - totalMemoryMB,
UsedCPUPercent: totalCPUPercent,
Score: score,
}
@ -331,12 +332,10 @@ func (hnm *HomeNodeManager) getNodeResourceUsage(ctx context.Context, nodeID str
// calculateCapacityScore calculates a 0.0-1.0 score (higher is better)
func (hnm *HomeNodeManager) calculateCapacityScore(deploymentCount, allocatedPorts, availablePorts, usedMemoryMB, usedCPUPercent int) float64 {
const (
maxDeployments = 100 // Max deployments per node
maxMemoryMB = 8192 // 8GB
maxCPUPercent = 400 // 400% = 4 cores
maxPorts = 9900 // ~10k ports available
)
maxDeployments := constants.MaxDeploymentsPerNode
maxMemoryMB := constants.MaxMemoryMB
maxCPUPercent := constants.MaxCPUPercent
maxPorts := constants.MaxPortsPerNode
// Calculate individual component scores (0.0 to 1.0)
deploymentScore := 1.0 - (float64(deploymentCount) / float64(maxDeployments))

View File

@ -3,6 +3,7 @@ package deployments
import (
"context"
"fmt"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
@ -216,21 +217,6 @@ func isConflictError(err error) bool {
if err == nil {
return false
}
// RQLite returns constraint violation errors as strings containing "UNIQUE constraint failed"
errStr := err.Error()
return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
}
// contains checks if a string contains a substring (case-insensitive)
func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
}
func findSubstring(s, substr string) bool {
for i := 0; i <= len(s)-len(substr); i++ {
if s[i:i+len(substr)] == substr {
return true
}
}
return false
return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict")
}

View File

@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"reflect"
"strings"
"testing"
"github.com/DeBrosOfficial/network/pkg/rqlite"
@ -410,7 +411,7 @@ func TestContains(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := contains(tt.s, tt.substr)
result := strings.Contains(tt.s, tt.substr)
if result != tt.expected {
t.Errorf("contains(%q, %q) = %v, expected %v", tt.s, tt.substr, result, tt.expected)
}

View File

@ -249,9 +249,7 @@ var (
ErrNoNodesAvailable = &DeploymentError{Message: "no nodes available for deployment"}
ErrDeploymentNotFound = &DeploymentError{Message: "deployment not found"}
ErrNamespaceNotAssigned = &DeploymentError{Message: "namespace has no home node assigned"}
ErrInvalidDeploymentType = &DeploymentError{Message: "invalid deployment type"}
ErrSubdomainTaken = &DeploymentError{Message: "subdomain already in use"}
ErrDomainReserved = &DeploymentError{Message: "domain is reserved"}
)
// DeploymentError represents a deployment-related error

View File

@ -429,7 +429,9 @@ func (sg *SecretGenerator) SaveConfig(filename string, content string) error {
}
// Fix ownership
exec.Command("chown", "debros:debros", configPath).Run()
if err := exec.Command("chown", "debros:debros", configPath).Run(); err != nil {
fmt.Printf("Warning: failed to chown %s to debros:debros: %v\n", configPath, err)
}
return nil
}

View File

@ -3,7 +3,6 @@ package production
import (
"os"
"path/filepath"
"strings"
"gopkg.in/yaml.v3"
)
@ -15,10 +14,7 @@ type NodePreferences struct {
AnyoneClient bool `yaml:"anyone_client"`
}
const (
preferencesFile = "preferences.yaml"
legacyBranchFile = ".branch"
)
const preferencesFile = "preferences.yaml"
// SavePreferences saves node preferences to disk
func SavePreferences(oramaDir string, prefs *NodePreferences) error {
@ -38,10 +34,6 @@ func SavePreferences(oramaDir string, prefs *NodePreferences) error {
return err
}
// Also save branch to legacy .branch file for backward compatibility
legacyPath := filepath.Join(oramaDir, legacyBranchFile)
os.WriteFile(legacyPath, []byte(prefs.Branch), 0644)
return nil
}
@ -53,7 +45,7 @@ func LoadPreferences(oramaDir string) *NodePreferences {
Nameserver: false,
}
// Try to load from preferences.yaml first
// Try to load from preferences.yaml
path := filepath.Join(oramaDir, preferencesFile)
if data, err := os.ReadFile(path); err == nil {
if err := yaml.Unmarshal(data, prefs); err == nil {
@ -61,15 +53,6 @@ func LoadPreferences(oramaDir string) *NodePreferences {
}
}
// Fall back to legacy .branch file
legacyPath := filepath.Join(oramaDir, legacyBranchFile)
if data, err := os.ReadFile(legacyPath); err == nil {
branch := strings.TrimSpace(string(data))
if branch != "" {
prefs.Branch = branch
}
}
return prefs
}

View File

@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"net/url"
"strings"
@ -234,31 +235,15 @@ func isPrivateOrLocalHost(host string) bool {
}
// Check for localhost variants
if host == "localhost" || host == "::1" {
if host == "localhost" {
return true
}
// Check common private ranges (basic check)
if strings.HasPrefix(host, "10.") ||
strings.HasPrefix(host, "192.168.") ||
strings.HasPrefix(host, "172.16.") ||
strings.HasPrefix(host, "172.17.") ||
strings.HasPrefix(host, "172.18.") ||
strings.HasPrefix(host, "172.19.") ||
strings.HasPrefix(host, "172.20.") ||
strings.HasPrefix(host, "172.21.") ||
strings.HasPrefix(host, "172.22.") ||
strings.HasPrefix(host, "172.23.") ||
strings.HasPrefix(host, "172.24.") ||
strings.HasPrefix(host, "172.25.") ||
strings.HasPrefix(host, "172.26.") ||
strings.HasPrefix(host, "172.27.") ||
strings.HasPrefix(host, "172.28.") ||
strings.HasPrefix(host, "172.29.") ||
strings.HasPrefix(host, "172.30.") ||
strings.HasPrefix(host, "172.31.") {
return true
// Parse as IP and use standard library checks
ip := net.ParseIP(host)
if ip == nil {
return false
}
return false
return ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast()
}

View File

@ -23,9 +23,8 @@ import (
type HTTPGateway struct {
logger *logging.ColoredLogger
config *config.HTTPGatewayConfig
router chi.Router
reverseProxies map[string]*httputil.ReverseProxy
mu sync.RWMutex
router chi.Router
mu sync.RWMutex
server *http.Server
}
@ -46,8 +45,7 @@ func NewHTTPGateway(logger *logging.ColoredLogger, cfg *config.HTTPGatewayConfig
gateway := &HTTPGateway{
logger: logger,
config: cfg,
router: chi.NewRouter(),
reverseProxies: make(map[string]*httputil.ReverseProxy),
router: chi.NewRouter(),
}
// Set up router middleware
@ -110,8 +108,6 @@ func (hg *HTTPGateway) initializeRoutes() error {
}
}
hg.reverseProxies[routeName] = proxy
// Register route handler
hg.registerRouteHandler(routeName, routeConfig, proxy)

View File

@ -1111,35 +1111,6 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
}
}
// Legacy format: {name}.node-{shortID}.{baseDomain} (backwards compatibility)
if len(parts) == 2 && strings.HasPrefix(parts[1], "node-") {
deploymentName := parts[0]
shortNodeID := parts[1] // e.g., "node-kv4la8"
// Query by name and matching short node ID
query := `
SELECT id, namespace, name, type, port, content_cid, status, home_node_id
FROM deployments
WHERE name = ?
AND ('node-' || substr(home_node_id, 9, 6) = ? OR home_node_id = ?)
AND status = 'active'
LIMIT 1
`
result, err := db.Query(internalCtx, query, deploymentName, shortNodeID, shortNodeID)
if err == nil && len(result.Rows) > 0 {
row := result.Rows[0]
return &deployments.Deployment{
ID: getString(row[0]),
Namespace: getString(row[1]),
Name: getString(row[2]),
Type: deployments.DeploymentType(getString(row[3])),
Port: getInt(row[4]),
ContentCID: getString(row[5]),
Status: deployments.DeploymentStatus(getString(row[6])),
HomeNodeID: getString(row[7]),
}, nil
}
}
}
// Try custom domain from deployment_domains table

View File

@ -9,6 +9,7 @@ import (
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/wireguard"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/peer"
"github.com/multiformats/go-multiaddr"
@ -337,16 +338,21 @@ func (pd *PeerDiscovery) updateHeartbeat(ctx context.Context) error {
}
// GetWireGuardIP detects the local WireGuard IP address using the wg0 network
// interface or the WireGuard config file. It does not require a PeerDiscovery
// instance and can be called from anywhere in the gateway package.
// interface, the 'ip' command, or the WireGuard config file.
// It does not require a PeerDiscovery instance and can be called from anywhere
// in the gateway package.
func GetWireGuardIP() (string, error) {
// Method 1: Use 'ip addr show wg0' command (works without root)
ip, err := getWireGuardIPFromCommand()
if err == nil {
// Method 1: Use net.InterfaceByName (shared implementation)
if ip, err := wireguard.GetIP(); err == nil {
return ip, nil
}
// Method 2: Try to read from WireGuard config file (requires root, may fail)
// Method 2: Use 'ip addr show wg0' command (works without root)
if ip, err := getWireGuardIPFromCommand(); err == nil {
return ip, nil
}
// Method 3: Try to read from WireGuard config file (requires root, may fail)
configPath := "/etc/wireguard/wg0.conf"
data, err := os.ReadFile(configPath)
if err == nil {
@ -359,7 +365,6 @@ func GetWireGuardIP() (string, error) {
parts := strings.Split(line, "=")
if len(parts) == 2 {
addrWithCIDR := strings.TrimSpace(parts[1])
// Remove /24 suffix
ip := strings.Split(addrWithCIDR, "/")[0]
ip = strings.TrimSpace(ip)
return ip, nil

229
pkg/inspector/analyzer.go Normal file
View File

@ -0,0 +1,229 @@
package inspector
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strings"
"time"
)
// AnalysisResult holds the AI's analysis of check failures.
type AnalysisResult struct {
	Model    string        // OpenRouter model that produced the analysis
	Analysis string        // raw analysis text returned by the model
	Duration time.Duration // wall-clock time the OpenRouter call took
}
// Analyze sends failures and cluster context to OpenRouter for AI analysis.
// When apiKey is empty it falls back to the OPENROUTER_API_KEY environment
// variable; if neither is set, an error is returned without making a request.
func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) {
	key := apiKey
	if key == "" {
		key = os.Getenv("OPENROUTER_API_KEY")
	}
	if key == "" {
		return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env")
	}

	// Render failures, warnings, and condensed cluster state into one prompt.
	prompt := buildAnalysisPrompt(results, data)

	started := time.Now()
	answer, err := callOpenRouter(model, key, prompt)
	if err != nil {
		return nil, fmt.Errorf("OpenRouter API call failed: %w", err)
	}

	return &AnalysisResult{
		Model:    model,
		Analysis: answer,
		Duration: time.Since(started),
	}, nil
}
// buildAnalysisPrompt renders the check results and condensed cluster state
// into a single text prompt for the model: system context, node overview,
// pass/fail summary, failure list, warning list, condensed RQLite state,
// and the task instructions.
//
// NOTE(review): iteration over data.Nodes is Go map order, so node listing
// order in the prompt varies between runs.
func buildAnalysisPrompt(results *Results, data *ClusterData) string {
	var b strings.Builder
	// System context
	b.WriteString("You are a distributed systems expert analyzing health check results for an Orama Network cluster.\n")
	b.WriteString("The cluster runs RQLite (Raft consensus), Olric (distributed cache), IPFS, CoreDNS, and WireGuard.\n\n")
	// Cluster overview
	b.WriteString("## Cluster Overview\n")
	b.WriteString(fmt.Sprintf("Nodes inspected: %d\n", len(data.Nodes)))
	for host, nd := range data.Nodes {
		b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role))
	}
	b.WriteString("\n")
	// Summary
	passed, failed, warned, skipped := results.Summary()
	b.WriteString(fmt.Sprintf("## Check Results: %d passed, %d failed, %d warnings, %d skipped\n\n", passed, failed, warned, skipped))
	// List all failures
	failures := results.Failures()
	if len(failures) > 0 {
		b.WriteString("## Failures (CRITICAL)\n")
		for _, f := range failures {
			node := f.Node
			if node == "" {
				node = "cluster-wide"
			}
			b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", f.Severity, f.Name, node, f.Message))
		}
		b.WriteString("\n")
	}
	// List all warnings (FailuresAndWarnings returns both; keep only warns)
	warnings := results.FailuresAndWarnings()
	warningsOnly := make([]CheckResult, 0)
	for _, w := range warnings {
		if w.Status == StatusWarn {
			warningsOnly = append(warningsOnly, w)
		}
	}
	if len(warningsOnly) > 0 {
		b.WriteString("## Warnings\n")
		for _, w := range warningsOnly {
			node := w.Node
			if node == "" {
				node = "cluster-wide"
			}
			b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", w.Severity, w.Name, node, w.Message))
		}
		b.WriteString("\n")
	}
	// Add raw RQLite status for context (condensed) — only nodes that
	// actually reported RQLite status data are included.
	b.WriteString("## Raw Cluster Data (condensed)\n")
	for host, nd := range data.Nodes {
		if nd.RQLite != nil && nd.RQLite.Status != nil {
			s := nd.RQLite.Status
			b.WriteString(fmt.Sprintf("### %s (RQLite)\n", host))
			b.WriteString(fmt.Sprintf("  raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n",
				s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter))
			if nd.RQLite.Nodes != nil {
				b.WriteString(fmt.Sprintf("  /nodes reports %d members:", len(nd.RQLite.Nodes)))
				for addr, n := range nd.RQLite.Nodes {
					reachable := "ok"
					if !n.Reachable {
						reachable = "UNREACHABLE"
					}
					leader := ""
					if n.Leader {
						leader = " LEADER"
					}
					b.WriteString(fmt.Sprintf(" %s(%s%s)", addr, reachable, leader))
				}
				b.WriteString("\n")
			}
		}
	}
	b.WriteString("\n## Task\n")
	b.WriteString("Analyze the failures and warnings above. For each issue:\n")
	b.WriteString("1. Explain the root cause\n")
	b.WriteString("2. Assess the severity and impact on the cluster\n")
	b.WriteString("3. Suggest specific commands or actions to fix it\n")
	b.WriteString("\nBe concise and actionable. Group related issues together. Use markdown formatting.\n")
	return b.String()
}
// OpenRouter API types (OpenAI-compatible)

// openRouterRequest is the chat-completion request body: a model name plus
// an ordered list of messages.
type openRouterRequest struct {
	Model    string              `json:"model"`
	Messages []openRouterMessage `json:"messages"`
}

// openRouterMessage is a single chat message. This package only ever sends
// one message with role "user".
type openRouterMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// openRouterResponse is the subset of the chat-completion response this
// package reads: the choices' message content, or a structured API error.
type openRouterResponse struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
	Error *struct {
		Message string `json:"message"`
		Code    int    `json:"code"`
	} `json:"error"`
}
// callOpenRouter performs a single chat-completion request against the
// OpenRouter API and returns the content of the first choice. It returns an
// error on transport failure, a non-200 status, a structured API error, an
// empty choice list, or a blank model reply.
func callOpenRouter(model, apiKey, prompt string) (string, error) {
	payload, err := json.Marshal(openRouterRequest{
		Model:    model,
		Messages: []openRouterMessage{{Role: "user", Content: prompt}},
	})
	if err != nil {
		return "", fmt.Errorf("marshal request: %w", err)
	}

	req, err := http.NewRequest("POST", "https://openrouter.ai/api/v1/chat/completions", bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+apiKey)

	// Generous timeout: large models can take a long time to answer.
	httpClient := &http.Client{Timeout: 120 * time.Second}
	resp, err := httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("HTTP request: %w", err)
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("API returned %d: %s", resp.StatusCode, string(raw))
	}

	var parsed openRouterResponse
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return "", fmt.Errorf("unmarshal response: %w", err)
	}
	switch {
	case parsed.Error != nil:
		return "", fmt.Errorf("API error: %s", parsed.Error.Message)
	case len(parsed.Choices) == 0:
		return "", fmt.Errorf("no choices in response (raw: %s)", truncate(string(raw), 500))
	}

	answer := parsed.Choices[0].Message.Content
	if strings.TrimSpace(answer) == "" {
		return "", fmt.Errorf("model returned empty response (raw: %s)", truncate(string(raw), 500))
	}
	return answer, nil
}
// truncate caps s at max bytes, appending "..." when it was cut short.
// Strings of max bytes or fewer are returned unchanged.
func truncate(s string, max int) string {
	if len(s) > max {
		return s[:max] + "..."
	}
	return s
}
// PrintAnalysis writes the AI analysis to w: a header naming the model, a
// 70-character rule, the analysis text, and the elapsed time in seconds.
func PrintAnalysis(analysis *AnalysisResult, w io.Writer) {
	fmt.Fprintf(w, "\n## AI Analysis (%s)\n", analysis.Model)
	fmt.Fprintln(w, strings.Repeat("-", 70))
	fmt.Fprintln(w, analysis.Analysis)
	fmt.Fprintf(w, "\n(Analysis took %.1fs)\n", analysis.Duration.Seconds())
}

172
pkg/inspector/checker.go Normal file
View File

@ -0,0 +1,172 @@
package inspector
import (
	"sort"
	"time"
)
// Severity levels for check results.
type Severity int

const (
	Low Severity = iota
	Medium
	High
	Critical
)

// String returns the upper-case label for the severity level, or "UNKNOWN"
// for values outside the defined range.
func (s Severity) String() string {
	labels := [...]string{
		Low:      "LOW",
		Medium:   "MEDIUM",
		High:     "HIGH",
		Critical: "CRITICAL",
	}
	if s < 0 || int(s) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[s]
}
// Status represents the outcome of a check.
type Status string

const (
	StatusPass Status = "pass" // check succeeded
	StatusFail Status = "fail" // check detected a problem
	StatusWarn Status = "warn" // noteworthy but not a hard failure
	StatusSkip Status = "skip" // check was not executed
)
// CheckResult holds the outcome of a single health check.
type CheckResult struct {
	ID        string   `json:"id"`             // stable identifier, e.g. "rqlite.leader_exists"
	Name      string   `json:"name"`           // human-readable title, e.g. "Cluster has exactly one leader"
	Subsystem string   `json:"subsystem"`      // owning subsystem, e.g. "rqlite"
	Severity  Severity `json:"severity"`       // impact level of the check
	Status    Status   `json:"status"`         // pass/fail/warn/skip outcome
	Message   string   `json:"message"`        // human-readable detail
	Node      string   `json:"node,omitempty"` // which node (empty for cluster-wide)
}
// Results holds all check outcomes.
type Results struct {
	Checks   []CheckResult `json:"checks"`   // every check result that was produced
	Duration time.Duration `json:"duration"` // total time spent running checks (set by RunChecks)
}
// Summary returns how many checks landed in each status bucket.
// Statuses outside the four known values are ignored.
func (r *Results) Summary() (passed, failed, warned, skipped int) {
	counts := make(map[Status]int, 4)
	for _, c := range r.Checks {
		counts[c.Status]++
	}
	return counts[StatusPass], counts[StatusFail], counts[StatusWarn], counts[StatusSkip]
}
// Failures returns only the checks that failed, in their original order.
func (r *Results) Failures() []CheckResult {
	var failed []CheckResult
	for i := range r.Checks {
		if c := r.Checks[i]; c.Status == StatusFail {
			failed = append(failed, c)
		}
	}
	return failed
}
// FailuresAndWarnings returns the checks that failed or warned, preserving
// their original order.
func (r *Results) FailuresAndWarnings() []CheckResult {
	var flagged []CheckResult
	for _, c := range r.Checks {
		switch c.Status {
		case StatusFail, StatusWarn:
			flagged = append(flagged, c)
		}
	}
	return flagged
}
// CheckFunc is the signature for a subsystem check function.
type CheckFunc func(data *ClusterData) []CheckResult

// SubsystemCheckers maps subsystem names to their check functions.
// Populated by checks/ package init or by explicit registration.
// NOTE(review): package-level mutable state with no synchronization —
// registration is expected to happen during init, before checks run.
var SubsystemCheckers = map[string]CheckFunc{}

// RegisterChecker registers a check function for a subsystem.
// Registering the same subsystem twice replaces the earlier function.
func RegisterChecker(subsystem string, fn CheckFunc) {
	SubsystemCheckers[subsystem] = fn
}
// RunChecks executes checks for the requested subsystems against collected data.
//
// subsystems filters which registered checkers run: an empty slice (or the
// literal "all") runs every checker, and "wg" is accepted as an alias for
// "wireguard". Checkers run in sorted subsystem-name order so the result
// ordering is deterministic across runs — ranging over the registry map
// directly would produce a different order every invocation.
func RunChecks(data *ClusterData, subsystems []string) *Results {
	start := time.Now()
	results := &Results{}

	// shouldCheck reports whether a registered subsystem is selected by the
	// caller-supplied filter.
	shouldCheck := func(name string) bool {
		if len(subsystems) == 0 {
			return true
		}
		for _, s := range subsystems {
			if s == name || s == "all" {
				return true
			}
			// Alias: "wg" matches "wireguard"
			if s == "wg" && name == "wireguard" {
				return true
			}
		}
		return false
	}

	// Collect and sort registry keys: Go map iteration order is randomized,
	// which would make check output order differ between runs.
	names := make([]string, 0, len(SubsystemCheckers))
	for name := range SubsystemCheckers {
		names = append(names, name)
	}
	sort.Strings(names)

	for _, name := range names {
		if shouldCheck(name) {
			results.Checks = append(results.Checks, SubsystemCheckers[name](data)...)
		}
	}

	results.Duration = time.Since(start)
	return results
}
// Pass creates a passing check result.
func Pass(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusPass, Message: msg, Node: node,
}
}
// Fail creates a failing check result.
func Fail(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusFail, Message: msg, Node: node,
}
}
// Warn creates a warning check result.
func Warn(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusWarn, Message: msg, Node: node,
}
}
// Skip creates a skipped check result.
func Skip(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusSkip, Message: msg, Node: node,
}
}

View File

@ -0,0 +1,190 @@
package inspector
import (
"testing"
"time"
)
// TestSummary verifies Summary tallies each status bucket independently.
func TestSummary(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusPass},
			{ID: "c", Status: StatusFail},
			{ID: "d", Status: StatusWarn},
			{ID: "e", Status: StatusSkip},
			{ID: "f", Status: StatusPass},
		},
	}
	passed, failed, warned, skipped := r.Summary()
	if passed != 3 {
		t.Errorf("passed: want 3, got %d", passed)
	}
	if failed != 1 {
		t.Errorf("failed: want 1, got %d", failed)
	}
	if warned != 1 {
		t.Errorf("warned: want 1, got %d", warned)
	}
	if skipped != 1 {
		t.Errorf("skipped: want 1, got %d", skipped)
	}
}

// TestFailures verifies Failures returns only StatusFail entries.
func TestFailures(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusFail},
			{ID: "c", Status: StatusWarn},
			{ID: "d", Status: StatusFail},
		},
	}
	failures := r.Failures()
	if len(failures) != 2 {
		t.Fatalf("want 2 failures, got %d", len(failures))
	}
	for _, f := range failures {
		if f.Status != StatusFail {
			t.Errorf("expected StatusFail, got %s for check %s", f.Status, f.ID)
		}
	}
}

// TestFailuresAndWarnings verifies both fail and warn entries are returned,
// while pass and skip entries are excluded.
func TestFailuresAndWarnings(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusFail},
			{ID: "c", Status: StatusWarn},
			{ID: "d", Status: StatusSkip},
		},
	}
	fw := r.FailuresAndWarnings()
	if len(fw) != 2 {
		t.Fatalf("want 2 failures+warnings, got %d", len(fw))
	}
}
// TestPass verifies the Pass constructor copies status, severity, and node.
func TestPass(t *testing.T) {
	c := Pass("test.id", "Test Name", "sub", "node1", "msg", Critical)
	if c.Status != StatusPass {
		t.Errorf("want pass, got %s", c.Status)
	}
	if c.Severity != Critical {
		t.Errorf("want Critical, got %s", c.Severity)
	}
	if c.Node != "node1" {
		t.Errorf("want node1, got %s", c.Node)
	}
}

// TestFail verifies Fail sets the fail status and preserves an empty
// (cluster-wide) node field.
func TestFail(t *testing.T) {
	c := Fail("test.id", "Test Name", "sub", "", "msg", High)
	if c.Status != StatusFail {
		t.Errorf("want fail, got %s", c.Status)
	}
	if c.Node != "" {
		t.Errorf("want empty node, got %q", c.Node)
	}
}

// TestWarn verifies Warn sets the warn status.
func TestWarn(t *testing.T) {
	c := Warn("test.id", "Test Name", "sub", "n", "msg", Medium)
	if c.Status != StatusWarn {
		t.Errorf("want warn, got %s", c.Status)
	}
}

// TestSkip verifies Skip sets the skip status.
func TestSkip(t *testing.T) {
	c := Skip("test.id", "Test Name", "sub", "n", "msg", Low)
	if c.Status != StatusSkip {
		t.Errorf("want skip, got %s", c.Status)
	}
}
// TestSeverityString covers every defined severity label plus the
// out-of-range fallback.
func TestSeverityString(t *testing.T) {
	tests := []struct {
		sev  Severity
		want string
	}{
		{Low, "LOW"},
		{Medium, "MEDIUM"},
		{High, "HIGH"},
		{Critical, "CRITICAL"},
		{Severity(99), "UNKNOWN"},
	}
	for _, tt := range tests {
		t.Run(tt.want, func(t *testing.T) {
			if got := tt.sev.String(); got != tt.want {
				t.Errorf("Severity(%d).String() = %q, want %q", tt.sev, got, tt.want)
			}
		})
	}
}

// TestRunChecks_EmptyData ensures RunChecks handles an empty cluster
// without panicking and still returns a non-nil Results.
func TestRunChecks_EmptyData(t *testing.T) {
	data := &ClusterData{
		Nodes:    map[string]*NodeData{},
		Duration: time.Second,
	}
	results := RunChecks(data, nil)
	if results == nil {
		t.Fatal("RunChecks returned nil")
	}
	// Should not panic and should return a valid Results
}
func TestRunChecks_FilterBySubsystem(t *testing.T) {
// Register a test checker
called := map[string]bool{}
SubsystemCheckers["test_sub_a"] = func(data *ClusterData) []CheckResult {
called["a"] = true
return []CheckResult{Pass("a.1", "A1", "test_sub_a", "", "ok", Low)}
}
SubsystemCheckers["test_sub_b"] = func(data *ClusterData) []CheckResult {
called["b"] = true
return []CheckResult{Pass("b.1", "B1", "test_sub_b", "", "ok", Low)}
}
defer delete(SubsystemCheckers, "test_sub_a")
defer delete(SubsystemCheckers, "test_sub_b")
data := &ClusterData{Nodes: map[string]*NodeData{}}
// Filter to only "test_sub_a"
results := RunChecks(data, []string{"test_sub_a"})
if !called["a"] {
t.Error("test_sub_a checker was not called")
}
if called["b"] {
t.Error("test_sub_b checker should not have been called")
}
found := false
for _, c := range results.Checks {
if c.ID == "a.1" {
found = true
}
if c.Subsystem == "test_sub_b" {
t.Error("should not have checks from test_sub_b")
}
}
if !found {
t.Error("expected check a.1 in results")
}
}
// TestRunChecks_AliasWG verifies that the "wg" subsystem filter is accepted
// as an alias for the registered "wireguard" checker.
func TestRunChecks_AliasWG(t *testing.T) {
	called := false
	// Save any real "wireguard" checker so the test restores — rather than
	// deletes — it on exit; a bare `defer delete` would unregister a genuine
	// checker for every later test in the package.
	prev, had := SubsystemCheckers["wireguard"]
	defer func() {
		if had {
			SubsystemCheckers["wireguard"] = prev
		} else {
			delete(SubsystemCheckers, "wireguard")
		}
	}()
	RegisterChecker("wireguard", func(data *ClusterData) []CheckResult {
		called = true
		return nil
	})
	data := &ClusterData{Nodes: map[string]*NodeData{}}
	RunChecks(data, []string{"wg"})
	if !called {
		t.Error("wireguard checker not called via 'wg' alias")
	}
}

224
pkg/inspector/checks/dns.go Normal file
View File

@ -0,0 +1,224 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the DNS checker with the inspector's subsystem registry so
// importing this package makes the checks available to RunChecks.
func init() {
	inspector.RegisterChecker("dns", CheckDNS)
}

// dnsSub is the subsystem label attached to every DNS check result.
const dnsSub = "dns"
// CheckDNS runs all DNS/CoreDNS health checks against cluster data: per-node
// checks for every node that reported DNS data, followed by cluster-wide
// cross-node checks. Nodes without DNS data are skipped.
func CheckDNS(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.DNS != nil {
			out = append(out, checkDNSPerNode(nd)...)
		}
	}
	return append(out, checkDNSCrossNode(data)...)
}
// checkDNSPerNode evaluates a single node's DNS stack: service liveness
// (CoreDNS, Caddy), listening ports (53/80/443), CoreDNS resource usage and
// restart count, recent log errors, Corefile presence, record resolution
// (SOA/NS/wildcard/base A), and TLS certificate expiry.
//
// If CoreDNS itself is inactive, the function reports only that failure and
// returns early — the dependent checks would all fail for the same root
// cause. TLS checks are emitted only when a non-negative days-left value was
// collected (negative means "not checked").
func checkDNSPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	dns := nd.DNS
	node := nd.Node.Name()
	// 4.1 CoreDNS service running (early return on failure — see doc comment)
	if dns.CoreDNSActive {
		r = append(r, inspector.Pass("dns.coredns_active", "CoreDNS service active", dnsSub, node,
			"coredns is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.coredns_active", "CoreDNS service active", dnsSub, node,
			"coredns is not active", inspector.Critical))
		return r
	}
	// 4.47 Caddy service running
	if dns.CaddyActive {
		r = append(r, inspector.Pass("dns.caddy_active", "Caddy service active", dnsSub, node,
			"caddy is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.caddy_active", "Caddy service active", dnsSub, node,
			"caddy is not active", inspector.Critical))
	}
	// 4.8 DNS port 53 bound
	if dns.Port53Bound {
		r = append(r, inspector.Pass("dns.port_53", "DNS port 53 bound", dnsSub, node,
			"UDP 53 is listening", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.port_53", "DNS port 53 bound", dnsSub, node,
			"UDP 53 is NOT listening", inspector.Critical))
	}
	// 4.10 HTTP port 80 — only a warning: not as critical as 53/443
	if dns.Port80Bound {
		r = append(r, inspector.Pass("dns.port_80", "HTTP port 80 bound", dnsSub, node,
			"TCP 80 is listening", inspector.High))
	} else {
		r = append(r, inspector.Warn("dns.port_80", "HTTP port 80 bound", dnsSub, node,
			"TCP 80 is NOT listening", inspector.High))
	}
	// 4.11 HTTPS port 443
	if dns.Port443Bound {
		r = append(r, inspector.Pass("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
			"TCP 443 is listening", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
			"TCP 443 is NOT listening", inspector.Critical))
	}
	// 4.3 CoreDNS memory — thresholds: <100MB pass, <200MB warn, else fail.
	// Skipped entirely when no measurement (<=0) was collected.
	if dns.CoreDNSMemMB > 0 {
		if dns.CoreDNSMemMB < 100 {
			r = append(r, inspector.Pass("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB", dns.CoreDNSMemMB), inspector.Medium))
		} else if dns.CoreDNSMemMB < 200 {
			r = append(r, inspector.Warn("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB (elevated)", dns.CoreDNSMemMB), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB (high)", dns.CoreDNSMemMB), inspector.High))
		}
	}
	// 4.4 CoreDNS restart count — 0 pass, 1-3 warn, >3 fail
	if dns.CoreDNSRestarts == 0 {
		r = append(r, inspector.Pass("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			"NRestarts=0", inspector.High))
	} else if dns.CoreDNSRestarts <= 3 {
		r = append(r, inspector.Warn("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			fmt.Sprintf("NRestarts=%d", dns.CoreDNSRestarts), inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			fmt.Sprintf("NRestarts=%d (crash-looping?)", dns.CoreDNSRestarts), inspector.High))
	}
	// 4.7 CoreDNS log error rate — 0 pass, <5 warn, else fail
	if dns.LogErrors == 0 {
		r = append(r, inspector.Pass("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			"0 errors in last 5 minutes", inspector.High))
	} else if dns.LogErrors < 5 {
		r = append(r, inspector.Warn("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
	}
	// 4.14 Corefile exists
	if dns.CorefileExists {
		r = append(r, inspector.Pass("dns.corefile_exists", "Corefile exists", dnsSub, node,
			"/etc/coredns/Corefile present", inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.corefile_exists", "Corefile exists", dnsSub, node,
			"/etc/coredns/Corefile NOT found", inspector.High))
	}
	// 4.20 SOA resolution
	if dns.SOAResolves {
		r = append(r, inspector.Pass("dns.soa_resolves", "SOA record resolves", dnsSub, node,
			"dig SOA returned result", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.soa_resolves", "SOA record resolves", dnsSub, node,
			"dig SOA returned no result", inspector.Critical))
	}
	// 4.21 NS records resolve
	if dns.NSResolves {
		r = append(r, inspector.Pass("dns.ns_resolves", "NS records resolve", dnsSub, node,
			fmt.Sprintf("%d NS records returned", dns.NSRecordCount), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.ns_resolves", "NS records resolve", dnsSub, node,
			"dig NS returned no results", inspector.Critical))
	}
	// 4.23 Wildcard DNS resolution
	if dns.WildcardResolves {
		r = append(r, inspector.Pass("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
			"test-wildcard.<domain> returned IP", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
			"test-wildcard.<domain> returned no IP", inspector.Critical))
	}
	// 4.24 Base domain A record — only a warning on failure
	if dns.BaseAResolves {
		r = append(r, inspector.Pass("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
			"<domain> A record returned IP", inspector.High))
	} else {
		r = append(r, inspector.Warn("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
			"<domain> A record returned no IP", inspector.High))
	}
	// 4.50 TLS certificate - base domain (>30 days pass, >7 warn, else fail;
	// negative days-left means the cert was not checked — emit nothing)
	if dns.BaseTLSDaysLeft >= 0 {
		if dns.BaseTLSDaysLeft > 30 {
			r = append(r, inspector.Pass("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry", dns.BaseTLSDaysLeft), inspector.Critical))
		} else if dns.BaseTLSDaysLeft > 7 {
			r = append(r, inspector.Warn("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (expiring soon)", dns.BaseTLSDaysLeft), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (CRITICAL)", dns.BaseTLSDaysLeft), inspector.Critical))
		}
	}
	// 4.51 TLS certificate - wildcard (same thresholds as the base domain)
	if dns.WildTLSDaysLeft >= 0 {
		if dns.WildTLSDaysLeft > 30 {
			r = append(r, inspector.Pass("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry", dns.WildTLSDaysLeft), inspector.Critical))
		} else if dns.WildTLSDaysLeft > 7 {
			r = append(r, inspector.Warn("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (expiring soon)", dns.WildTLSDaysLeft), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (CRITICAL)", dns.WildTLSDaysLeft), inspector.Critical))
		}
	}
	return r
}
// checkDNSCrossNode emits the cluster-wide DNS check: every node that
// reported DNS data must be running CoreDNS. Returns nil when no node
// reported DNS data at all.
func checkDNSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	active, total := 0, 0
	for _, nd := range data.Nodes {
		if nd.DNS == nil {
			continue
		}
		total++
		if nd.DNS.CoreDNSActive {
			active++
		}
	}
	if total == 0 {
		return nil
	}

	msg := fmt.Sprintf("%d/%d nameservers active", active, total)
	mk := inspector.Fail
	if active == total {
		mk = inspector.Pass
	}
	return []inspector.CheckResult{
		mk("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "", msg, inspector.Critical),
	}
}

View File

@ -0,0 +1,232 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckDNS_CoreDNSInactive verifies the per-node short-circuit: when
// CoreDNS is down, only the coredns_active failure is reported and the
// dependent checks (ports, resolution, TLS) are suppressed.
func TestCheckDNS_CoreDNSInactive(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{CoreDNSActive: false}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.coredns_active", inspector.StatusFail)
	// Early return — no port checks
	if findCheck(results, "dns.port_53") != nil {
		t.Error("should not check ports when CoreDNS inactive")
	}
}
// TestCheckDNS_HealthyNode feeds fully healthy DNS data and expects every
// emitted per-node check to pass.
func TestCheckDNS_HealthyNode(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:    true,
		CaddyActive:      true,
		Port53Bound:      true,
		Port80Bound:      true,
		Port443Bound:     true,
		CoreDNSMemMB:     50,
		CoreDNSRestarts:  0,
		LogErrors:        0,
		CorefileExists:   true,
		SOAResolves:      true,
		NSResolves:       true,
		NSRecordCount:    3,
		WildcardResolves: true,
		BaseAResolves:    true,
		BaseTLSDaysLeft:  60,
		WildTLSDaysLeft:  60,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.coredns_active", inspector.StatusPass)
	expectStatus(t, results, "dns.caddy_active", inspector.StatusPass)
	expectStatus(t, results, "dns.port_53", inspector.StatusPass)
	expectStatus(t, results, "dns.port_80", inspector.StatusPass)
	expectStatus(t, results, "dns.port_443", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_memory", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_restarts", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_log_errors", inspector.StatusPass)
	expectStatus(t, results, "dns.corefile_exists", inspector.StatusPass)
	expectStatus(t, results, "dns.soa_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.ns_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.base_a_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.tls_base", inspector.StatusPass)
	expectStatus(t, results, "dns.tls_wildcard", inspector.StatusPass)
}

// TestCheckDNS_PortsFailing verifies the severity mapping for unbound
// ports: 53 and 443 are hard failures, 80 only warns.
func TestCheckDNS_PortsFailing(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive: true,
		Port53Bound:   false,
		Port80Bound:   false,
		Port443Bound:  false,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.port_53", inspector.StatusFail)
	expectStatus(t, results, "dns.port_80", inspector.StatusWarn)
	expectStatus(t, results, "dns.port_443", inspector.StatusFail)
}
// TestCheckDNS_Memory exercises the RSS thresholds: <100MB pass,
// 100-199MB warn, >=200MB fail.
func TestCheckDNS_Memory(t *testing.T) {
	tests := []struct {
		name   string
		memMB  int
		status inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 150, inspector.StatusWarn},
		{"high", 250, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSMemMB: tt.memMB}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_memory", tt.status)
		})
	}
}

// TestCheckDNS_Restarts exercises the restart-count thresholds:
// 0 pass, 1-3 warn, >3 fail.
func TestCheckDNS_Restarts(t *testing.T) {
	tests := []struct {
		name     string
		restarts int
		status   inspector.Status
	}{
		{"zero", 0, inspector.StatusPass},
		{"few", 2, inspector.StatusWarn},
		{"many", 5, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSRestarts: tt.restarts}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_restarts", tt.status)
		})
	}
}

// TestCheckDNS_LogErrors exercises the recent-log-error thresholds:
// 0 pass, 1-4 warn, >=5 fail.
func TestCheckDNS_LogErrors(t *testing.T) {
	tests := []struct {
		name   string
		errors int
		status inspector.Status
	}{
		{"none", 0, inspector.StatusPass},
		{"few", 3, inspector.StatusWarn},
		{"many", 10, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, LogErrors: tt.errors}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_log_errors", tt.status)
		})
	}
}
// TestCheckDNS_TLSExpiry exercises certificate expiry thresholds for both
// the base and wildcard certs: >30 days pass, 8-30 warn, <=7 fail.
func TestCheckDNS_TLSExpiry(t *testing.T) {
	tests := []struct {
		name   string
		days   int
		status inspector.Status
	}{
		{"healthy", 60, inspector.StatusPass},
		{"expiring soon", 20, inspector.StatusWarn},
		{"critical", 3, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{
				CoreDNSActive:   true,
				BaseTLSDaysLeft: tt.days,
				WildTLSDaysLeft: tt.days,
			}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.tls_base", tt.status)
			expectStatus(t, results, "dns.tls_wildcard", tt.status)
		})
	}
}

// TestCheckDNS_TLSNotChecked verifies that a negative days-left value (the
// "not checked" sentinel) suppresses the TLS checks entirely.
func TestCheckDNS_TLSNotChecked(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:   true,
		BaseTLSDaysLeft: -1,
		WildTLSDaysLeft: -1,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	// TLS checks should not be emitted when days == -1
	if findCheck(results, "dns.tls_base") != nil {
		t.Error("should not emit tls_base when days == -1")
	}
}
// TestCheckDNS_ResolutionFailures verifies failed record lookups: SOA, NS,
// and wildcard failures are hard fails, the base A record only warns.
func TestCheckDNS_ResolutionFailures(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:    true,
		SOAResolves:      false,
		NSResolves:       false,
		WildcardResolves: false,
		BaseAResolves:    false,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.soa_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.ns_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.base_a_resolves", inspector.StatusWarn)
}

// TestCheckDNS_CrossNode_AllActive verifies the cluster-wide check passes
// when every nameserver reports CoreDNS active.
func TestCheckDNS_CrossNode_AllActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
		nd := makeNodeData(host, "nameserver-ns1")
		nd.DNS = &inspector.DNSData{CoreDNSActive: true}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckDNS(data)
	expectStatus(t, results, "dns.all_ns_active", inspector.StatusPass)
}

// TestCheckDNS_CrossNode_PartialActive verifies that even a single inactive
// nameserver fails the cluster-wide check.
func TestCheckDNS_CrossNode_PartialActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	active := []bool{true, true, false}
	for i, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
		nd := makeNodeData(host, "nameserver-ns1")
		nd.DNS = &inspector.DNSData{CoreDNSActive: active[i]}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckDNS(data)
	expectStatus(t, results, "dns.all_ns_active", inspector.StatusFail)
}

// TestCheckDNS_NilData verifies nodes without collected DNS data produce no
// results at all (per-node and cross-node checks both skip them).
func TestCheckDNS_NilData(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	if len(results) != 0 {
		t.Errorf("expected 0 results for nil DNS data, got %d", len(results))
	}
}

View File

@ -0,0 +1,74 @@
package checks
import (
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// makeNode creates a test Node with the given host and role; the remaining
// fields are fixed dummy values.
func makeNode(host, role string) inspector.Node {
	return inspector.Node{
		Environment: "devnet",
		User:        "ubuntu",
		Host:        host,
		Password:    "test",
		Role:        role,
	}
}

// makeNodeData creates a NodeData with a node but no subsystem data, so
// individual tests attach only the subsystem payload they exercise.
func makeNodeData(host, role string) *inspector.NodeData {
	return &inspector.NodeData{
		Node: makeNode(host, role),
	}
}

// makeCluster creates a ClusterData from a map of host → NodeData.
func makeCluster(nodes map[string]*inspector.NodeData) *inspector.ClusterData {
	return &inspector.ClusterData{
		Nodes:    nodes,
		Duration: 1 * time.Second,
	}
}
// countByStatus counts results with the given status.
func countByStatus(results []inspector.CheckResult, status inspector.Status) int {
	n := 0
	for _, r := range results {
		if r.Status == status {
			n++
		}
	}
	return n
}

// findCheck returns a pointer to the first check matching the given ID, or nil.
// The pointer aliases the results slice element, not a copy.
func findCheck(results []inspector.CheckResult, id string) *inspector.CheckResult {
	for i := range results {
		if results[i].ID == id {
			return &results[i]
		}
	}
	return nil
}

// requireCheck finds a check by ID and fails the test if not found.
func requireCheck(t *testing.T, results []inspector.CheckResult, id string) inspector.CheckResult {
	t.Helper()
	c := findCheck(results, id)
	if c == nil {
		t.Fatalf("check %q not found in %d results", id, len(results))
	}
	return *c
}

// expectStatus asserts that a check with the given ID has the expected status.
func expectStatus(t *testing.T, results []inspector.CheckResult, id string, status inspector.Status) {
	t.Helper()
	c := requireCheck(t, results, id)
	if c.Status != status {
		t.Errorf("check %q: want status=%s, got status=%s (msg=%s)", id, status, c.Status, c.Message)
	}
}

View File

@ -0,0 +1,232 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the IPFS checker with the inspector's subsystem registry so
// importing this package makes the checks available to RunChecks.
func init() {
	inspector.RegisterChecker("ipfs", CheckIPFS)
}

// ipfsSub is the subsystem label attached to every IPFS check result.
const ipfsSub = "ipfs"
// CheckIPFS runs all IPFS health checks against cluster data: per-node
// checks for every node that reported IPFS data, followed by cluster-wide
// cross-node checks. Nodes without IPFS data are skipped.
func CheckIPFS(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.IPFS != nil {
			out = append(out, checkIPFSPerNode(nd, data)...)
		}
	}
	return append(out, checkIPFSCrossNode(data)...)
}
// checkIPFSPerNode evaluates IPFS health on a single node: daemon and
// cluster service state, swarm/cluster peer counts relative to cluster
// size, cluster peer errors, repo usage, reported versions, and the
// private-swarm configuration (swarm key present, empty bootstrap list).
// nd.IPFS is assumed non-nil (CheckIPFS filters nil entries).
func checkIPFSPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	ipfs := nd.IPFS
	node := nd.Node.Name()
	// 3.1 IPFS daemon running
	if ipfs.DaemonActive {
		r = append(r, inspector.Pass("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is not active", inspector.Critical))
		// Daemon down: remaining per-node checks would only add noise, so stop here.
		return r
	}
	// 3.2 IPFS Cluster running
	if ipfs.ClusterActive {
		r = append(r, inspector.Pass("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is not active", inspector.Critical))
	}
	// 3.6 Swarm peer count. Each node is expected to see every other
	// IPFS-bearing node as a swarm peer (N-1 peers).
	expectedNodes := countIPFSNodes(data)
	// A negative count presumably marks "metric not collected" — skipped. TODO confirm with the collector.
	if ipfs.SwarmPeerCount >= 0 {
		expectedPeers := expectedNodes - 1
		if expectedPeers < 0 {
			expectedPeers = 0
		}
		if ipfs.SwarmPeerCount >= expectedPeers {
			r = append(r, inspector.Pass("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else if ipfs.SwarmPeerCount > 0 {
			// Connected to some peers but fewer than expected: degraded, not isolated.
			r = append(r, inspector.Warn("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else {
			// Zero peers while others exist: the node is cut off from the swarm.
			r = append(r, inspector.Fail("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (isolated!)", ipfs.SwarmPeerCount), inspector.Critical))
		}
	}
	// 3.12 Cluster peer count — the cluster membership includes self,
	// so the expectation is the full node count (not N-1).
	if ipfs.ClusterPeerCount >= 0 {
		if ipfs.ClusterPeerCount >= expectedNodes {
			r = append(r, inspector.Pass("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		} else {
			r = append(r, inspector.Warn("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		}
	}
	// 3.14 Cluster peer errors
	if ipfs.ClusterErrors == 0 {
		r = append(r, inspector.Pass("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			"all cluster peers healthy", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			fmt.Sprintf("%d peers reporting errors", ipfs.ClusterErrors), inspector.Critical))
	}
	// 3.20 Repo size vs max: pass below 80% usage, warn up to 95%, fail above.
	// Skipped when either figure is missing (<= 0).
	if ipfs.RepoMaxBytes > 0 && ipfs.RepoSizeBytes > 0 {
		pct := float64(ipfs.RepoSizeBytes) / float64(ipfs.RepoMaxBytes) * 100
		sizeMB := ipfs.RepoSizeBytes / (1024 * 1024)
		maxMB := ipfs.RepoMaxBytes / (1024 * 1024)
		if pct < 80 {
			r = append(r, inspector.Pass("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else if pct < 95 {
			r = append(r, inspector.Warn("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else {
			r = append(r, inspector.Fail("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%% NEARLY FULL)", sizeMB, maxMB, pct), inspector.Critical))
		}
	}
	// 3.3 Version — informational pass only; nothing is emitted when the
	// version is missing or could not be determined.
	if ipfs.KuboVersion != "" && ipfs.KuboVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.kubo_version", "Kubo version reported", ipfsSub, node,
			fmt.Sprintf("kubo=%s", ipfs.KuboVersion), inspector.Low))
	}
	if ipfs.ClusterVersion != "" && ipfs.ClusterVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.cluster_version", "Cluster version reported", ipfsSub, node,
			fmt.Sprintf("cluster=%s", ipfs.ClusterVersion), inspector.Low))
	}
	// 3.29 Swarm key exists (private swarm)
	if ipfs.HasSwarmKey {
		r = append(r, inspector.Pass("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key NOT found", inspector.Critical))
	}
	// 3.30 Bootstrap empty (private swarm) — a private swarm must not dial
	// the public bootstrap peers.
	if ipfs.BootstrapEmpty {
		r = append(r, inspector.Pass("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"no public bootstrap peers", inspector.High))
	} else {
		r = append(r, inspector.Warn("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"bootstrap list is not empty (should be empty for private swarm)", inspector.High))
	}
	return r
}
// checkIPFSCrossNode compares IPFS state across nodes: Kubo and Cluster
// version consistency, and repo-size convergence. Only nodes with an
// active daemon participate; with fewer than two such nodes there is
// nothing to compare and no results are emitted. Cross-node results use
// an empty node field (cluster-wide scope).
func checkIPFSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	type nodeInfo struct {
		name string
		ipfs *inspector.IPFSData
	}
	var nodes []nodeInfo
	for _, nd := range data.Nodes {
		if nd.IPFS != nil && nd.IPFS.DaemonActive {
			nodes = append(nodes, nodeInfo{name: nd.Node.Name(), ipfs: nd.IPFS})
		}
	}
	if len(nodes) < 2 {
		return r
	}
	// Version consistency: bucket node names by reported version; exactly
	// one bucket means all nodes agree. Empty/"unknown" versions are ignored.
	kuboVersions := map[string][]string{}
	clusterVersions := map[string][]string{}
	for _, n := range nodes {
		if n.ipfs.KuboVersion != "" && n.ipfs.KuboVersion != "unknown" {
			kuboVersions[n.ipfs.KuboVersion] = append(kuboVersions[n.ipfs.KuboVersion], n.name)
		}
		if n.ipfs.ClusterVersion != "" && n.ipfs.ClusterVersion != "unknown" {
			clusterVersions[n.ipfs.ClusterVersion] = append(clusterVersions[n.ipfs.ClusterVersion], n.name)
		}
	}
	if len(kuboVersions) == 1 {
		// Single-entry map: the loop runs once to extract the version string.
		for v := range kuboVersions {
			r = append(r, inspector.Pass("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(kuboVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(kuboVersions)), inspector.Medium))
	}
	if len(clusterVersions) == 1 {
		for v := range clusterVersions {
			r = append(r, inspector.Pass("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(clusterVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(clusterVersions)), inspector.Medium))
	}
	// Repo size convergence: replicated content should give nodes roughly
	// similar repo sizes. A max/min ratio above 2x is flagged as divergence.
	var sizes []int64
	for _, n := range nodes {
		if n.ipfs.RepoSizeBytes > 0 {
			sizes = append(sizes, n.ipfs.RepoSizeBytes)
		}
	}
	if len(sizes) >= 2 {
		minSize, maxSize := sizes[0], sizes[0]
		for _, s := range sizes[1:] {
			if s < minSize {
				minSize = s
			}
			if s > maxSize {
				maxSize = s
			}
		}
		// minSize > 0 is guaranteed by the filter above; the guard also
		// protects the division below.
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 2.0 {
				r = append(r, inspector.Pass("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx", ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx (diverged)", ratio), inspector.Medium))
			}
		}
	}
	return r
}
// countIPFSNodes reports how many nodes in the cluster have IPFS data
// collected (regardless of daemon state).
func countIPFSNodes(data *inspector.ClusterData) int {
	n := 0
	for _, nd := range data.Nodes {
		if nd.IPFS == nil {
			continue
		}
		n++
	}
	return n
}

View File

@ -0,0 +1,183 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckIPFS_DaemonInactive verifies that an inactive daemon fails the
// daemon check and short-circuits the remaining per-node checks.
func TestCheckIPFS_DaemonInactive(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.IPFS = &inspector.IPFSData{DaemonActive: false}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ipfs.daemon_active", inspector.StatusFail)
	// Early return — no swarm peer checks
	if c := findCheck(results, "ipfs.swarm_peers"); c != nil {
		t.Error("should not check swarm_peers when daemon inactive")
	}
}
// TestCheckIPFS_HealthyNode exercises a fully healthy single-node cluster
// and expects every emitted per-node check to pass.
func TestCheckIPFS_HealthyNode(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.IPFS = &inspector.IPFSData{
		DaemonActive:     true,
		ClusterActive:    true,
		SwarmPeerCount:   0, // single node: expected peers = 0
		ClusterPeerCount: 1, // single node cluster
		ClusterErrors:    0,
		RepoSizeBytes:    500 * 1024 * 1024,  // 500MB (≈49% of max → pass)
		RepoMaxBytes:     1024 * 1024 * 1024, // 1GB
		KuboVersion:      "0.22.0",
		ClusterVersion:   "1.0.8",
		HasSwarmKey:      true,
		BootstrapEmpty:   true,
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckIPFS(data)
	expectStatus(t, results, "ipfs.daemon_active", inspector.StatusPass)
	expectStatus(t, results, "ipfs.cluster_active", inspector.StatusPass)
	expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass)
	expectStatus(t, results, "ipfs.cluster_peers", inspector.StatusPass)
	expectStatus(t, results, "ipfs.cluster_errors", inspector.StatusPass)
	expectStatus(t, results, "ipfs.repo_size", inspector.StatusPass)
	expectStatus(t, results, "ipfs.swarm_key", inspector.StatusPass)
	expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusPass)
}
// TestCheckIPFS_SwarmPeers covers the three swarm-peer outcomes: enough
// peers (pass), some-but-too-few peers (warn), and zero peers while other
// nodes exist (fail: isolated).
func TestCheckIPFS_SwarmPeers(t *testing.T) {
	// Single-node cluster: expected peers = 0
	t.Run("enough", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
		results := CheckIPFS(data)
		// swarm_peers=2, expected=0 → pass
		expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass)
	})
	t.Run("low but nonzero", func(t *testing.T) {
		// 3-node cluster: expected peers = 2 per node
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1} // has 1, expects 2
		nd2 := makeNodeData("2.2.2.2", "node")
		nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		nd3 := makeNodeData("3.3.3.3", "node")
		nd3.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		data := makeCluster(map[string]*inspector.NodeData{
			"1.1.1.1": nd, "2.2.2.2": nd2, "3.3.3.3": nd3,
		})
		results := CheckIPFS(data)
		// Node 1.1.1.1 should warn (1 < 2).
		// Node names are "ubuntu@<ip>" — presumably assigned by makeNodeData.
		found := false
		for _, r := range results {
			if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusWarn {
				found = true
			}
		}
		if !found {
			t.Error("expected swarm_peers warn for node 1.1.1.1")
		}
	})
	t.Run("zero isolated", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 0}
		nd2 := makeNodeData("2.2.2.2", "node")
		nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1}
		data := makeCluster(map[string]*inspector.NodeData{
			"1.1.1.1": nd, "2.2.2.2": nd2,
		})
		results := CheckIPFS(data)
		found := false
		for _, r := range results {
			if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusFail {
				found = true
			}
		}
		if !found {
			t.Error("expected swarm_peers fail for isolated node 1.1.1.1")
		}
	})
}
// TestCheckIPFS_RepoSize verifies the repo-usage thresholds:
// <80% pass, 80-95% warn, >=95% fail.
func TestCheckIPFS_RepoSize(t *testing.T) {
	tests := []struct {
		name   string
		size   int64
		max    int64
		status inspector.Status
	}{
		{"healthy", 500 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusPass},     // 50%
		{"elevated", 870 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusWarn},    // 85%
		{"nearly full", 980 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusFail}, // 96%
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.IPFS = &inspector.IPFSData{
				DaemonActive:  true,
				RepoSizeBytes: tt.size,
				RepoMaxBytes:  tt.max,
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			results := CheckIPFS(data)
			expectStatus(t, results, "ipfs.repo_size", tt.status)
		})
	}
}
// TestCheckIPFS_SwarmKeyMissing verifies that a missing swarm.key fails the
// private-swarm check.
func TestCheckIPFS_SwarmKeyMissing(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.IPFS = &inspector.IPFSData{DaemonActive: true, HasSwarmKey: false}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ipfs.swarm_key", inspector.StatusFail)
}
// TestCheckIPFS_BootstrapNotEmpty verifies that a non-empty bootstrap list
// only warns (a private swarm should have none).
func TestCheckIPFS_BootstrapNotEmpty(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.IPFS = &inspector.IPFSData{DaemonActive: true, BootstrapEmpty: false}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusWarn)
}
// TestCheckIPFS_CrossNode_VersionConsistency verifies that identical Kubo
// and Cluster versions across three nodes pass both consistency checks.
func TestCheckIPFS_CrossNode_VersionConsistency(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for _, host := range hosts {
		n := makeNodeData(host, "node")
		n.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: "0.22.0", ClusterVersion: "1.0.8"}
		nodes[host] = n
	}
	results := CheckIPFS(makeCluster(nodes))
	expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusPass)
	expectStatus(t, results, "ipfs.cluster_version_consistent", inspector.StatusPass)
}
// TestCheckIPFS_CrossNode_VersionMismatch verifies that diverging Kubo
// versions across nodes downgrade the consistency check to a warning.
func TestCheckIPFS_CrossNode_VersionMismatch(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	versions := []string{"0.22.0", "0.22.0", "0.21.0"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		n := makeNodeData(host, "node")
		n.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: versions[i]}
		nodes[host] = n
	}
	results := CheckIPFS(makeCluster(nodes))
	expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusWarn)
}
// TestCheckIPFS_NilData verifies that a node without collected IPFS data
// produces no results at all.
func TestCheckIPFS_NilData(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	if got := len(results); got != 0 {
		t.Errorf("expected 0 results for nil IPFS data, got %d", got)
	}
}

View File

@ -0,0 +1,155 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the namespace check suite with the inspector framework so
// it is executed as part of every cluster inspection run.
func init() {
	inspector.RegisterChecker("namespace", CheckNamespace)
}
const nsSub = "namespace"

// CheckNamespace runs all namespace-level health checks.
// Nodes with no namespaces are skipped; cross-node aggregate checks are
// appended after the per-node results.
func CheckNamespace(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nodeData := range data.Nodes {
		if len(nodeData.Namespaces) == 0 {
			continue
		}
		out = append(out, checkNamespacesPerNode(nodeData)...)
	}
	out = append(out, checkNamespacesCrossNode(data)...)
	return out
}
// checkNamespacesPerNode emits per-namespace service checks for one node:
// RQLite liveness, raft state and readiness, Olric memberlist port, and
// gateway HTTP health. Check IDs are namespaced as "ns.<name>.<check>".
func checkNamespacesPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	node := nd.Node.Name()
	for _, ns := range nd.Namespaces {
		prefix := fmt.Sprintf("ns.%s", ns.Name)
		// RQLite health
		if ns.RQLiteUp {
			r = append(r, inspector.Pass(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d state=%s", ns.PortBase, ns.RQLiteState), inspector.Critical))
		} else {
			r = append(r, inspector.Fail(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d not responding", ns.PortBase), inspector.Critical))
		}
		// RQLite Raft state — only meaningful (and only emitted) while RQLite responds.
		// Leader/Follower are healthy; Candidate means an election is in flight.
		if ns.RQLiteUp {
			switch ns.RQLiteState {
			case "Leader", "Follower":
				r = append(r, inspector.Pass(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			case "Candidate":
				r = append(r, inspector.Warn(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					"state=Candidate (election in progress)", inspector.Critical))
			default:
				r = append(r, inspector.Fail(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			}
		}
		// RQLite readiness — a readiness failure is only reported while the
		// service itself is up; when RQLite is down, rqlite_up already fails.
		if ns.RQLiteReady {
			r = append(r, inspector.Pass(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz OK", inspector.Critical))
		} else if ns.RQLiteUp {
			r = append(r, inspector.Fail(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz failed", inspector.Critical))
		}
		// Olric health
		if ns.OlricUp {
			r = append(r, inspector.Pass(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port bound", inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port not bound", inspector.High))
		}
		// Gateway health
		if ns.GatewayUp {
			r = append(r, inspector.Pass(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		}
	}
	return r
}
// checkNamespacesCrossNode emits cluster-wide checks per namespace:
//   - all_healthy: every node running the namespace has RQLite, Olric and
//     the gateway up simultaneously;
//   - quorum: at least N/2+1 of the namespace's RQLite instances respond,
//     where N is the number of nodes running the namespace.
//
// Cross-node results use an empty node field (cluster-wide scope).
func checkNamespacesCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	// Aggregate all per-namespace counters in a single pass over the nodes.
	// (Previously the RQLite count was recomputed by rescanning every node's
	// namespace list once per namespace name — O(namespaces × nodes).)
	nsNodes := map[string]int{}    // namespace name → count of nodes running it
	nsHealthy := map[string]int{}  // namespace name → nodes where all services are up
	nsRQLiteUp := map[string]int{} // namespace name → nodes where RQLite responds
	for _, nd := range data.Nodes {
		for _, ns := range nd.Namespaces {
			nsNodes[ns.Name]++
			if ns.RQLiteUp {
				nsRQLiteUp[ns.Name]++
			}
			if ns.RQLiteUp && ns.OlricUp && ns.GatewayUp {
				nsHealthy[ns.Name]++
			}
		}
	}
	for name, total := range nsNodes {
		healthy := nsHealthy[name]
		if healthy == total {
			r = append(r, inspector.Pass(
				fmt.Sprintf("ns.%s.all_healthy", name),
				fmt.Sprintf("Namespace %s healthy on all nodes", name),
				nsSub, "",
				fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
				inspector.Critical))
		} else {
			r = append(r, inspector.Fail(
				fmt.Sprintf("ns.%s.all_healthy", name),
				fmt.Sprintf("Namespace %s healthy on all nodes", name),
				nsSub, "",
				fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
				inspector.Critical))
		}
		// Check namespace has quorum (>= N/2+1 RQLite instances).
		rqliteUp := nsRQLiteUp[name]
		quorumNeeded := total/2 + 1
		if rqliteUp >= quorumNeeded {
			r = append(r, inspector.Pass(
				fmt.Sprintf("ns.%s.quorum", name),
				fmt.Sprintf("Namespace %s RQLite quorum", name),
				nsSub, "",
				fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d", rqliteUp, total, quorumNeeded),
				inspector.Critical))
		} else {
			r = append(r, inspector.Fail(
				fmt.Sprintf("ns.%s.quorum", name),
				fmt.Sprintf("Namespace %s RQLite quorum", name),
				nsSub, "",
				fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d (QUORUM LOST)", rqliteUp, total, quorumNeeded),
				inspector.Critical))
		}
	}
	return r
}

View File

@ -0,0 +1,165 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckNamespace_PerNodeHealthy verifies that a namespace whose
// services are all up passes every per-node check.
func TestCheckNamespace_PerNodeHealthy(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{{
		Name:          "myapp",
		PortBase:      10000,
		RQLiteUp:      true,
		RQLiteState:   "Leader",
		RQLiteReady:   true,
		OlricUp:       true,
		GatewayUp:     true,
		GatewayStatus: 200,
	}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for _, id := range []string{
		"ns.myapp.rqlite_up",
		"ns.myapp.rqlite_state",
		"ns.myapp.rqlite_ready",
		"ns.myapp.olric_up",
		"ns.myapp.gateway_up",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckNamespace_RQLiteDown verifies that an unresponsive RQLite fails
// the rqlite_up check.
func TestCheckNamespace_RQLiteDown(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{{Name: "myapp", PortBase: 10000, RQLiteUp: false}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ns.myapp.rqlite_up", inspector.StatusFail)
}
// TestCheckNamespace_RQLiteStates maps each raft state to its expected
// check status: Leader/Follower pass, Candidate warns, anything else fails.
func TestCheckNamespace_RQLiteStates(t *testing.T) {
	tests := []struct {
		state  string
		status inspector.Status
	}{
		{"Leader", inspector.StatusPass},
		{"Follower", inspector.StatusPass},
		{"Candidate", inspector.StatusWarn},
		{"Unknown", inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.state, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Namespaces = []inspector.NamespaceData{
				{Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: tt.state},
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			results := CheckNamespace(data)
			expectStatus(t, results, "ns.myapp.rqlite_state", tt.status)
		})
	}
}
// TestCheckNamespace_RQLiteNotReady verifies that an up-but-not-ready
// RQLite fails the readiness check.
func TestCheckNamespace_RQLiteNotReady(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{{
		Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: "Follower", RQLiteReady: false,
	}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusFail)
}
// TestCheckNamespace_OlricDown verifies that an unbound Olric memberlist
// port fails the olric_up check.
func TestCheckNamespace_OlricDown(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{{Name: "myapp", OlricUp: false}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusFail)
}
// TestCheckNamespace_GatewayDown verifies that an unresponsive gateway
// fails the gateway_up check.
func TestCheckNamespace_GatewayDown(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{{Name: "myapp", GatewayUp: false, GatewayStatus: 0}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusFail)
}
// TestCheckNamespace_CrossNode_AllHealthy verifies that a namespace fully
// healthy on all three nodes passes both aggregate checks.
func TestCheckNamespace_CrossNode_AllHealthy(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for _, host := range hosts {
		n := makeNodeData(host, "node")
		n.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: true, OlricUp: true, GatewayUp: true}}
		nodes[host] = n
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusPass)
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
}
// TestCheckNamespace_CrossNode_PartialHealthy verifies that one unhealthy
// node fails all_healthy while RQLite quorum is still satisfied.
func TestCheckNamespace_CrossNode_PartialHealthy(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		n := makeNodeData(host, "node")
		// Olric is down on the third node only.
		n.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: true, OlricUp: i < 2, GatewayUp: true}}
		nodes[host] = n
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusFail)
	// Quorum should still pass (3/3 RQLite up, need 2).
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
}
// TestCheckNamespace_CrossNode_QuorumLost verifies that only 1/3 RQLite
// instances up (quorum needs 2) fails the quorum check.
func TestCheckNamespace_CrossNode_QuorumLost(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	rqliteUp := []bool{true, false, false}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		n := makeNodeData(host, "node")
		n.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: rqliteUp[i], OlricUp: true, GatewayUp: true}}
		nodes[host] = n
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusFail)
}
// TestCheckNamespace_MultipleNamespaces verifies that checks for multiple
// namespaces on one node are evaluated independently.
func TestCheckNamespace_MultipleNamespaces(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = []inspector.NamespaceData{
		{Name: "app1", RQLiteUp: true, RQLiteState: "Leader", OlricUp: true, GatewayUp: true},
		{Name: "app2", RQLiteUp: false, OlricUp: true, GatewayUp: true},
	}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "ns.app1.rqlite_up", inspector.StatusPass)
	expectStatus(t, results, "ns.app2.rqlite_up", inspector.StatusFail)
}
// TestCheckNamespace_NoNamespaces verifies that a node without namespaces
// produces no results at all — neither per-node nor cross-node.
func TestCheckNamespace_NoNamespaces(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Namespaces = nil
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for _, res := range results {
		t.Errorf("unexpected check: %s", res.ID)
	}
}

View File

@ -0,0 +1,113 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the network check suite with the inspector framework so
// it is executed as part of every cluster inspection run.
func init() {
	inspector.RegisterChecker("network", CheckNetwork)
}
const networkSub = "network"

// CheckNetwork runs all network-level health checks.
// Nodes without collected network data are skipped; there are no
// cross-node network checks.
func CheckNetwork(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nodeData := range data.Nodes {
		if nodeData.Network == nil {
			continue
		}
		out = append(out, checkNetworkPerNode(nodeData)...)
	}
	return out
}
// checkNetworkPerNode evaluates host-level network health for one node:
// internet reachability, routing-table entries (default + WireGuard
// subnet), TCP connection pressure, retransmission rate, and WireGuard
// mesh ping connectivity. nd.Network is assumed non-nil (CheckNetwork
// filters nil entries).
func checkNetworkPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	net := nd.Network
	node := nd.Node.Name()
	// 7.2 Internet connectivity
	if net.InternetReachable {
		r = append(r, inspector.Pass("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 succeeded", inspector.High))
	} else {
		r = append(r, inspector.Fail("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 failed", inspector.High))
	}
	// 7.14 Default route
	if net.DefaultRoute {
		r = append(r, inspector.Pass("network.default_route", "Default route exists", networkSub, node,
			"default route present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.default_route", "Default route exists", networkSub, node,
			"no default route", inspector.Critical))
	}
	// 7.15 WG subnet route (the 10.0.0.0/24 mesh route via wg0)
	if net.WGRouteExists {
		r = append(r, inspector.Pass("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 via wg0 present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 route via wg0 NOT found", inspector.Critical))
	}
	// 7.4 TCP connections — pass below 5000 established connections,
	// warn at or above. Zero presumably means "metric not collected",
	// so the check is skipped entirely. TODO confirm with the collector.
	if net.TCPEstablished > 0 {
		if net.TCPEstablished < 5000 {
			r = append(r, inspector.Pass("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d", net.TCPEstablished), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d (high)", net.TCPEstablished), inspector.Medium))
		}
	}
	// 7.6 TIME_WAIT — warn once 10000 sockets accumulate.
	if net.TCPTimeWait < 10000 {
		r = append(r, inspector.Pass("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d", net.TCPTimeWait), inspector.Medium))
	} else {
		r = append(r, inspector.Warn("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d (accumulating)", net.TCPTimeWait), inspector.Medium))
	}
	// 7.8 TCP retransmission rate (percent): <1 pass, 1-5 warn, >=5 fail.
	// A negative rate presumably marks "metric unavailable" — skipped.
	if net.TCPRetransRate >= 0 {
		if net.TCPRetransRate < 1 {
			r = append(r, inspector.Pass("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%%", net.TCPRetransRate), inspector.Medium))
		} else if net.TCPRetransRate < 5 {
			r = append(r, inspector.Warn("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (elevated)", net.TCPRetransRate), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (high packet loss)", net.TCPRetransRate), inspector.High))
		}
	}
	// 7.10 WG mesh peer pings (NxN connectivity) — all peers must answer;
	// any unreachable peer fails the check. No check when no pings ran.
	if len(net.PingResults) > 0 {
		failCount := 0
		for _, ok := range net.PingResults {
			if !ok {
				failCount++
			}
		}
		if failCount == 0 {
			r = append(r, inspector.Pass("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers pingable", len(net.PingResults), len(net.PingResults)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers unreachable", failCount, len(net.PingResults)), inspector.Critical))
		}
	}
	return r
}

View File

@ -0,0 +1,151 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckNetwork_HealthyNode verifies that a node with healthy network
// metrics passes every emitted check.
func TestCheckNetwork_HealthyNode(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Network = &inspector.NetworkData{
		InternetReachable: true,
		DefaultRoute:      true,
		WGRouteExists:     true,
		TCPEstablished:    100,
		TCPTimeWait:       50,
		TCPRetransRate:    0.1,
		PingResults:       map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
	}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for _, id := range []string{
		"network.internet",
		"network.default_route",
		"network.wg_route",
		"network.tcp_established",
		"network.tcp_timewait",
		"network.tcp_retrans",
		"network.wg_mesh_ping",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckNetwork_InternetUnreachable verifies that a failed internet
// ping fails the internet check.
func TestCheckNetwork_InternetUnreachable(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Network = &inspector.NetworkData{InternetReachable: false}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "network.internet", inspector.StatusFail)
}
// TestCheckNetwork_MissingRoutes verifies that missing default and WG
// routes each fail their respective checks.
func TestCheckNetwork_MissingRoutes(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Network = &inspector.NetworkData{DefaultRoute: false, WGRouteExists: false}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "network.default_route", inspector.StatusFail)
	expectStatus(t, results, "network.wg_route", inspector.StatusFail)
}
// TestCheckNetwork_TCPConnections verifies the established-connection
// thresholds: below 5000 passes, 5000 or more warns.
func TestCheckNetwork_TCPConnections(t *testing.T) {
	tests := []struct {
		name   string
		estab  int
		status inspector.Status
	}{
		{"normal", 100, inspector.StatusPass},
		{"high", 6000, inspector.StatusWarn},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Network = &inspector.NetworkData{TCPEstablished: tt.estab}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			results := CheckNetwork(data)
			expectStatus(t, results, "network.tcp_established", tt.status)
		})
	}
}
// TestCheckNetwork_TCPTimeWait verifies the TIME_WAIT thresholds:
// below 10000 passes, 10000 or more warns.
func TestCheckNetwork_TCPTimeWait(t *testing.T) {
	tests := []struct {
		name   string
		tw     int
		status inspector.Status
	}{
		{"normal", 50, inspector.StatusPass},
		{"high", 15000, inspector.StatusWarn},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Network = &inspector.NetworkData{TCPTimeWait: tt.tw}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			results := CheckNetwork(data)
			expectStatus(t, results, "network.tcp_timewait", tt.status)
		})
	}
}
// TestCheckNetwork_TCPRetransmission verifies the retransmission-rate
// thresholds: <1% pass, 1-5% warn, >=5% fail.
func TestCheckNetwork_TCPRetransmission(t *testing.T) {
	tests := []struct {
		name   string
		rate   float64
		status inspector.Status
	}{
		{"low", 0.1, inspector.StatusPass},
		{"elevated", 3.0, inspector.StatusWarn},
		{"high", 8.0, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Network = &inspector.NetworkData{TCPRetransRate: tt.rate}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			results := CheckNetwork(data)
			expectStatus(t, results, "network.tcp_retrans", tt.status)
		})
	}
}
// TestCheckNetwork_WGMeshPing covers the mesh-ping outcomes: all peers
// reachable (pass), any peer unreachable (fail), and no ping data at all
// (check not emitted).
func TestCheckNetwork_WGMeshPing(t *testing.T) {
	t.Run("all ok", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.Network = &inspector.NetworkData{
			PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
		}
		data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
		results := CheckNetwork(data)
		expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass)
	})
	t.Run("some fail", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.Network = &inspector.NetworkData{
			PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": false},
		}
		data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
		results := CheckNetwork(data)
		expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusFail)
	})
	t.Run("no pings", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.Network = &inspector.NetworkData{PingResults: map[string]bool{}}
		data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
		results := CheckNetwork(data)
		// No ping results → no wg_mesh_ping check
		if findCheck(results, "network.wg_mesh_ping") != nil {
			t.Error("should not emit wg_mesh_ping when no ping results")
		}
	})
}
// TestCheckNetwork_NilData ensures a node without Network data yields no results.
func TestCheckNetwork_NilData(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n})
	if results := CheckNetwork(cluster); len(results) != 0 {
		t.Errorf("expected 0 results for nil Network data, got %d", len(results))
	}
}

View File

@ -0,0 +1,157 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the Olric checker with the inspector framework at package
// load time so it runs as part of the full check suite.
func init() {
	inspector.RegisterChecker("olric", CheckOlric)
}
// olricSub is the subsystem label attached to every Olric check result.
const olricSub = "olric"
// CheckOlric runs all Olric health checks against cluster data.
// Per-node checks are emitted first for every node that reported Olric data,
// followed by cluster-wide (cross-node) checks.
func CheckOlric(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.Olric == nil {
			continue // no Olric data collected for this node
		}
		out = append(out, checkOlricPerNode(node)...)
	}
	return append(out, checkOlricCrossNode(data)...)
}
// checkOlricPerNode emits the per-node Olric checks for a single node:
// service liveness, memberlist port, restart count, process memory, and
// log-derived signals (suspects, flapping, error rate).
func checkOlricPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var out []inspector.CheckResult
	ol := nd.Olric
	name := nd.Node.Name()
	// 2.1 Service active — the remaining checks are meaningless if the unit
	// is down, so bail out early on failure.
	if !ol.ServiceActive {
		out = append(out, inspector.Fail("olric.service_active", "Olric service active", olricSub, name,
			"debros-olric is not active", inspector.Critical))
		return out
	}
	out = append(out, inspector.Pass("olric.service_active", "Olric service active", olricSub, name,
		"debros-olric is active", inspector.Critical))
	// 2.7 Memberlist port accepting connections
	if ol.MemberlistUp {
		out = append(out, inspector.Pass("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, name,
			"TCP 3322 is bound", inspector.Critical))
	} else {
		out = append(out, inspector.Fail("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, name,
			"TCP 3322 is not listening", inspector.Critical))
	}
	// 2.3 Restart count: 0 pass, 1-3 warn, >3 fail.
	switch {
	case ol.RestartCount == 0:
		out = append(out, inspector.Pass("olric.restarts", "Low restart count", olricSub, name,
			"NRestarts=0", inspector.High))
	case ol.RestartCount <= 3:
		out = append(out, inspector.Warn("olric.restarts", "Low restart count", olricSub, name,
			fmt.Sprintf("NRestarts=%d", ol.RestartCount), inspector.High))
	default:
		out = append(out, inspector.Fail("olric.restarts", "Low restart count", olricSub, name,
			fmt.Sprintf("NRestarts=%d (crash-looping?)", ol.RestartCount), inspector.High))
	}
	// 2.4 Process memory — only checked when a measurement is available.
	if ol.ProcessMemMB > 0 {
		switch {
		case ol.ProcessMemMB < 200:
			out = append(out, inspector.Pass("olric.memory", "Process memory healthy", olricSub, name,
				fmt.Sprintf("RSS=%dMB", ol.ProcessMemMB), inspector.Medium))
		case ol.ProcessMemMB < 500:
			out = append(out, inspector.Warn("olric.memory", "Process memory healthy", olricSub, name,
				fmt.Sprintf("RSS=%dMB (elevated)", ol.ProcessMemMB), inspector.Medium))
		default:
			out = append(out, inspector.Fail("olric.memory", "Process memory healthy", olricSub, name,
				fmt.Sprintf("RSS=%dMB (high)", ol.ProcessMemMB), inspector.High))
		}
	}
	// 2.9-2.11 Log analysis: suspect/failed member messages
	if ol.LogSuspects == 0 {
		out = append(out, inspector.Pass("olric.log_suspects", "No suspect/failed members in logs", olricSub, name,
			"no suspect messages in last hour", inspector.Critical))
	} else {
		out = append(out, inspector.Fail("olric.log_suspects", "No suspect/failed members in logs", olricSub, name,
			fmt.Sprintf("%d suspect/failed messages in last hour", ol.LogSuspects), inspector.Critical))
	}
	// 2.13 Flapping detection (rapid join/leave cycles)
	if ol.LogFlapping < 5 {
		out = append(out, inspector.Pass("olric.log_flapping", "No rapid join/leave cycles", olricSub, name,
			fmt.Sprintf("join/leave events=%d in last hour", ol.LogFlapping), inspector.High))
	} else {
		out = append(out, inspector.Warn("olric.log_flapping", "No rapid join/leave cycles", olricSub, name,
			fmt.Sprintf("join/leave events=%d in last hour (flapping?)", ol.LogFlapping), inspector.High))
	}
	// 2.39 Log error rate: <5 pass, <20 warn, otherwise fail.
	switch {
	case ol.LogErrors < 5:
		out = append(out, inspector.Pass("olric.log_errors", "Log error rate low", olricSub, name,
			fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
	case ol.LogErrors < 20:
		out = append(out, inspector.Warn("olric.log_errors", "Log error rate low", olricSub, name,
			fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
	default:
		out = append(out, inspector.Fail("olric.log_errors", "Log error rate low", olricSub, name,
			fmt.Sprintf("errors=%d in last hour (high)", ol.LogErrors), inspector.High))
	}
	return out
}
// checkOlricCrossNode emits cluster-wide Olric checks: that every node is
// running the service and that every memberlist port is listening. It is
// skipped entirely when fewer than two nodes report Olric data.
func checkOlricCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	var active, memberlist, total int
	for _, nd := range data.Nodes {
		if nd.Olric == nil {
			continue
		}
		total++
		if nd.Olric.ServiceActive {
			active++
		}
		if nd.Olric.MemberlistUp {
			memberlist++
		}
	}
	// Cross-node comparisons need at least two participating nodes.
	if total < 2 {
		return out
	}
	detail := fmt.Sprintf("%d/%d nodes active", active, total)
	if active == total {
		out = append(out, inspector.Pass("olric.all_active", "All nodes running Olric", olricSub, "",
			detail, inspector.Critical))
	} else {
		out = append(out, inspector.Fail("olric.all_active", "All nodes running Olric", olricSub, "",
			detail, inspector.Critical))
	}
	detail = fmt.Sprintf("%d/%d nodes with memberlist", memberlist, total)
	if memberlist == total {
		out = append(out, inspector.Pass("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
			detail, inspector.High))
	} else {
		out = append(out, inspector.Warn("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
			detail, inspector.High))
	}
	return out
}

View File

@ -0,0 +1,149 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckOlric_ServiceInactive verifies that an inactive service fails the
// liveness check and short-circuits the remaining per-node checks.
func TestCheckOlric_ServiceInactive(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{ServiceActive: false}
	results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "olric.service_active", inspector.StatusFail)
	// The checker must return early — no further per-node checks.
	if findCheck(results, "olric.memberlist_port") != nil {
		t.Error("should not check memberlist when service inactive")
	}
}
// TestCheckOlric_HealthyNode verifies that a fully healthy node passes every
// per-node Olric check.
func TestCheckOlric_HealthyNode(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{
		ServiceActive: true,
		MemberlistUp:  true,
		RestartCount:  0,
		ProcessMemMB:  100,
		LogSuspects:   0,
		LogFlapping:   0,
		LogErrors:     0,
	}
	results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for _, id := range []string{
		"olric.service_active",
		"olric.memberlist_port",
		"olric.restarts",
		"olric.log_suspects",
		"olric.log_flapping",
		"olric.log_errors",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckOlric_RestartCounts verifies the restart-count thresholds:
// zero passes, a few warn, many fail.
func TestCheckOlric_RestartCounts(t *testing.T) {
	cases := []struct {
		name     string
		restarts int
		status   inspector.Status
	}{
		{name: "zero", restarts: 0, status: inspector.StatusPass},
		{name: "few", restarts: 2, status: inspector.StatusWarn},
		{name: "many", restarts: 5, status: inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, RestartCount: tc.restarts}
			results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, results, "olric.restarts", tc.status)
		})
	}
}
// TestCheckOlric_Memory verifies the process-memory thresholds:
// low RSS passes, elevated warns, high fails.
func TestCheckOlric_Memory(t *testing.T) {
	cases := []struct {
		name   string
		memMB  int
		status inspector.Status
	}{
		{name: "healthy", memMB: 100, status: inspector.StatusPass},
		{name: "elevated", memMB: 300, status: inspector.StatusWarn},
		{name: "high", memMB: 600, status: inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, ProcessMemMB: tc.memMB}
			results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, results, "olric.memory", tc.status)
		})
	}
}
// TestCheckOlric_LogSuspects verifies that suspect/failed-member log messages
// fail the check.
func TestCheckOlric_LogSuspects(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{ServiceActive: true, LogSuspects: 5}
	results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "olric.log_suspects", inspector.StatusFail)
}
// TestCheckOlric_LogErrors verifies the log-error-rate thresholds:
// none passes, a few warn, many fail.
func TestCheckOlric_LogErrors(t *testing.T) {
	cases := []struct {
		name   string
		errors int
		status inspector.Status
	}{
		{name: "none", errors: 0, status: inspector.StatusPass},
		{name: "few", errors: 10, status: inspector.StatusWarn},
		{name: "many", errors: 30, status: inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, LogErrors: tc.errors}
			results := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, results, "olric.log_errors", tc.status)
		})
	}
}
// TestCheckOlric_CrossNode_AllActive verifies that a cluster where every node
// is active with memberlist up passes both cross-node checks.
func TestCheckOlric_CrossNode_AllActive(t *testing.T) {
	nodes := make(map[string]*inspector.NodeData)
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		n := makeNodeData(host, "node")
		n.Olric = &inspector.OlricData{ServiceActive: true, MemberlistUp: true}
		nodes[host] = n
	}
	results := CheckOlric(makeCluster(nodes))
	expectStatus(t, results, "olric.all_active", inspector.StatusPass)
	expectStatus(t, results, "olric.all_memberlist", inspector.StatusPass)
}
// TestCheckOlric_CrossNode_PartialActive verifies that the cluster-wide
// check fails when only some nodes are running Olric.
func TestCheckOlric_CrossNode_PartialActive(t *testing.T) {
	nodes := make(map[string]*inspector.NodeData)
	for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		up := i < 2 // third node is down
		n := makeNodeData(host, "node")
		n.Olric = &inspector.OlricData{ServiceActive: up, MemberlistUp: up}
		nodes[host] = n
	}
	results := CheckOlric(makeCluster(nodes))
	expectStatus(t, results, "olric.all_active", inspector.StatusFail)
}
// TestCheckOlric_NilData ensures a node without Olric data yields no results.
func TestCheckOlric_NilData(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n})
	if results := CheckOlric(cluster); len(results) != 0 {
		t.Errorf("expected 0 results for nil Olric data, got %d", len(results))
	}
}

View File

@ -0,0 +1,533 @@
package checks
import (
"fmt"
"math"
"strings"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the RQLite checker with the inspector framework at package
// load time so it runs as part of the full check suite.
func init() {
	inspector.RegisterChecker("rqlite", CheckRQLite)
}
// rqliteSub is the subsystem label attached to every RQLite check result.
const rqliteSub = "rqlite"
// CheckRQLite runs all RQLite health checks against cluster data.
// Per-node checks run first for every node that reported RQLite data,
// followed by cluster-wide (cross-node) consistency checks.
func CheckRQLite(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.RQLite == nil {
			continue // node did not report RQLite data
		}
		out = append(out, checkRQLitePerNode(nd, data)...)
	}
	return append(out, checkRQLiteCrossNode(data)...)
}
// checkRQLitePerNode emits the per-node RQLite checks for a single node:
// HTTP liveness/readiness, Raft state and leadership, log-index health,
// snapshot freshness, resource usage, cluster-member reachability, a
// strong-consistency read probe, and expvar-derived error counters.
//
// NOTE(review): the data parameter is not used anywhere in this function;
// it appears to be kept for signature parity with the cross-node checker —
// confirm before removing.
func checkRQLitePerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	rq := nd.RQLite
	node := nd.Node.Name()
	// 1.2 HTTP endpoint responsive — without /status nothing else was
	// collected, so bail out early on failure.
	if !rq.Responsive {
		r = append(r, inspector.Fail("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
			"curl localhost:5001/status failed or returned error", inspector.Critical))
		return r
	}
	r = append(r, inspector.Pass("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
		"responding on port 5001", inspector.Critical))
	// 1.3 Full readiness (/readyz): node, leader and store must all be ready.
	if rq.Readyz != nil {
		if rq.Readyz.Ready {
			r = append(r, inspector.Pass("rqlite.readyz", "Full readiness check", rqliteSub, node,
				"node, leader, store all ready", inspector.Critical))
		} else {
			// Report only the subsystems that are not ready.
			var parts []string
			if rq.Readyz.Node != "ready" {
				parts = append(parts, "node: "+rq.Readyz.Node)
			}
			if rq.Readyz.Leader != "ready" {
				parts = append(parts, "leader: "+rq.Readyz.Leader)
			}
			if rq.Readyz.Store != "ready" {
				parts = append(parts, "store: "+rq.Readyz.Store)
			}
			r = append(r, inspector.Fail("rqlite.readyz", "Full readiness check", rqliteSub, node,
				"not ready: "+strings.Join(parts, ", "), inspector.Critical))
		}
	}
	s := rq.Status
	if s == nil {
		// Responsive but unparseable /status: skip the remaining checks,
		// which all read from the parsed status document.
		r = append(r, inspector.Skip("rqlite.status_parsed", "Status JSON parseable", rqliteSub, node,
			"could not parse /status response", inspector.Critical))
		return r
	}
	// 1.5 Raft state valid
	switch s.RaftState {
	case "Leader", "Follower":
		r = append(r, inspector.Pass("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			fmt.Sprintf("state=%s", s.RaftState), inspector.Critical))
	case "Candidate":
		r = append(r, inspector.Warn("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			"state=Candidate (election in progress)", inspector.Critical))
	case "Shutdown":
		r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			"state=Shutdown", inspector.Critical))
	default:
		r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			fmt.Sprintf("unexpected state=%q", s.RaftState), inspector.Critical))
	}
	// 1.7 Leader identity known
	if s.LeaderNodeID == "" {
		r = append(r, inspector.Fail("rqlite.leader_known", "Leader identity known", rqliteSub, node,
			"leader node_id is empty", inspector.Critical))
	} else {
		r = append(r, inspector.Pass("rqlite.leader_known", "Leader identity known", rqliteSub, node,
			fmt.Sprintf("leader=%s", s.LeaderNodeID), inspector.Critical))
	}
	// 1.8 Voter status
	if s.Voter {
		r = append(r, inspector.Pass("rqlite.voter", "Node is voter", rqliteSub, node,
			"voter=true", inspector.Low))
	} else {
		r = append(r, inspector.Warn("rqlite.voter", "Node is voter", rqliteSub, node,
			"voter=false (non-voter)", inspector.Low))
	}
	// 1.9 Num peers — use the node's own /nodes endpoint to determine cluster size
	// (not config file, since not all config nodes are necessarily in the Raft cluster).
	// len(nil map) is 0, so no separate nil check is needed (staticcheck S1009).
	if len(rq.Nodes) > 0 {
		expectedPeers := len(rq.Nodes) - 1 // cluster members minus self
		if expectedPeers < 0 {
			expectedPeers = 0
		}
		if s.NumPeers == expectedPeers {
			r = append(r, inspector.Pass("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
				fmt.Sprintf("peers=%d (cluster has %d nodes)", s.NumPeers, len(rq.Nodes)), inspector.Critical))
		} else {
			r = append(r, inspector.Warn("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
				fmt.Sprintf("peers=%d but /nodes reports %d members", s.NumPeers, len(rq.Nodes)), inspector.High))
		}
	} else {
		r = append(r, inspector.Pass("rqlite.num_peers", "Peer count reported", rqliteSub, node,
			fmt.Sprintf("peers=%d", s.NumPeers), inspector.Medium))
	}
	// 1.11 Commit index vs applied index — the gap measures how far log
	// application lags behind consensus. Indexes are unsigned, so clamp the
	// gap to 0 when applied has overtaken commit to avoid underflow.
	if s.CommitIndex > 0 && s.AppliedIndex > 0 {
		gap := s.CommitIndex - s.AppliedIndex
		if s.AppliedIndex > s.CommitIndex {
			gap = 0
		}
		if gap <= 2 {
			r = append(r, inspector.Pass("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		} else if gap <= 100 {
			r = append(r, inspector.Warn("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d (lagging)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d (severely behind)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		}
	}
	// 1.12 FSM pending
	if s.FsmPending == 0 {
		r = append(r, inspector.Pass("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			"fsm_pending=0", inspector.High))
	} else if s.FsmPending <= 10 {
		r = append(r, inspector.Warn("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			fmt.Sprintf("fsm_pending=%d", s.FsmPending), inspector.High))
	} else {
		r = append(r, inspector.Fail("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			fmt.Sprintf("fsm_pending=%d (backlog)", s.FsmPending), inspector.High))
	}
	// 1.13 Last contact (followers only). Only a pass is emitted; there is no
	// fail path here in the current implementation.
	if s.RaftState == "Follower" && s.LastContact != "" {
		r = append(r, inspector.Pass("rqlite.last_contact", "Follower last contact recent", rqliteSub, node,
			fmt.Sprintf("last_contact=%s", s.LastContact), inspector.Critical))
	}
	// 1.14 Last log term matches current term
	if s.LastLogTerm > 0 && s.Term > 0 {
		if s.LastLogTerm == s.Term {
			r = append(r, inspector.Pass("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
				fmt.Sprintf("term=%d last_log_term=%d", s.Term, s.LastLogTerm), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
				fmt.Sprintf("term=%d last_log_term=%d (mismatch)", s.Term, s.LastLogTerm), inspector.Medium))
		}
	}
	// 1.15 db_applied_index == fsm_index — divergence indicates the SQLite
	// database and the Raft FSM are out of sync.
	if s.DBAppliedIndex > 0 && s.FsmIndex > 0 {
		if s.DBAppliedIndex == s.FsmIndex {
			r = append(r, inspector.Pass("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
				fmt.Sprintf("db_applied=%d fsm=%d", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
				fmt.Sprintf("db_applied=%d fsm=%d (diverged)", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
		}
	}
	// 1.18 Last snapshot index close to applied (clamped against underflow)
	if s.LastSnapshot > 0 && s.AppliedIndex > 0 {
		gap := s.AppliedIndex - s.LastSnapshot
		if s.LastSnapshot > s.AppliedIndex {
			gap = 0
		}
		if gap < 10000 {
			r = append(r, inspector.Pass("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
				fmt.Sprintf("snapshot_index=%d applied=%d gap=%d", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
				fmt.Sprintf("snapshot_index=%d applied=%d gap=%d (old snapshot)", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
		}
	}
	// 1.19 At least 1 snapshot exists
	if s.LastSnapshot > 0 {
		r = append(r, inspector.Pass("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
			fmt.Sprintf("last_snapshot_index=%d", s.LastSnapshot), inspector.Medium))
	} else {
		r = append(r, inspector.Warn("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
			"no snapshots found", inspector.Medium))
	}
	// 1.27 Database size (informational only)
	if s.DBSizeFriendly != "" {
		r = append(r, inspector.Pass("rqlite.db_size", "Database size reported", rqliteSub, node,
			fmt.Sprintf("db_size=%s", s.DBSizeFriendly), inspector.Low))
	}
	// 1.31 Goroutine count
	if s.Goroutines > 0 {
		if s.Goroutines < 200 {
			r = append(r, inspector.Pass("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d", s.Goroutines), inspector.Medium))
		} else if s.Goroutines < 1000 {
			r = append(r, inspector.Warn("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d (elevated)", s.Goroutines), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d (high)", s.Goroutines), inspector.High))
		}
	}
	// 1.32 Memory (HeapAlloc, reported in bytes — converted to MB here)
	if s.HeapAlloc > 0 {
		mb := s.HeapAlloc / (1024 * 1024)
		if mb < 500 {
			r = append(r, inspector.Pass("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB", mb), inspector.Medium))
		} else if mb < 1000 {
			r = append(r, inspector.Warn("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB (elevated)", mb), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB (high)", mb), inspector.High))
		}
	}
	// 1.35 Version reported
	if s.Version != "" {
		r = append(r, inspector.Pass("rqlite.version", "Version reported", rqliteSub, node,
			fmt.Sprintf("version=%s", s.Version), inspector.Low))
	}
	// Node reachability from /nodes endpoint. One fail per unreachable peer;
	// a single aggregate pass when every peer is reachable. A nil map means
	// no /nodes data, so no check is emitted at all.
	if rq.Nodes != nil {
		unreachable := 0
		for addr, n := range rq.Nodes {
			if !n.Reachable {
				unreachable++
				r = append(r, inspector.Fail("rqlite.node_reachable", "Cluster node reachable", rqliteSub, node,
					fmt.Sprintf("%s is unreachable from this node", addr), inspector.Critical))
			}
		}
		if unreachable == 0 {
			r = append(r, inspector.Pass("rqlite.all_reachable", "All cluster nodes reachable", rqliteSub, node,
				fmt.Sprintf("all %d nodes reachable", len(rq.Nodes)), inspector.Critical))
		}
	}
	// 1.46 Strong read test — only emitted as a failure when the endpoint
	// was otherwise responsive.
	if rq.StrongRead {
		r = append(r, inspector.Pass("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
			"SELECT 1 at level=strong OK", inspector.Critical))
	} else if rq.Responsive {
		r = append(r, inspector.Fail("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
			"SELECT 1 at level=strong failed", inspector.Critical))
	}
	// Debug vars checks (expvar counters; only when /debug/vars was parsed)
	if dv := rq.DebugVars; dv != nil {
		// 1.28 Query errors
		if dv.QueryErrors == 0 {
			r = append(r, inspector.Pass("rqlite.query_errors", "No query errors", rqliteSub, node,
				"query_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Warn("rqlite.query_errors", "No query errors", rqliteSub, node,
				fmt.Sprintf("query_errors=%d", dv.QueryErrors), inspector.High))
		}
		// 1.29 Execute errors
		if dv.ExecuteErrors == 0 {
			r = append(r, inspector.Pass("rqlite.execute_errors", "No execute errors", rqliteSub, node,
				"execute_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Warn("rqlite.execute_errors", "No execute errors", rqliteSub, node,
				fmt.Sprintf("execute_errors=%d", dv.ExecuteErrors), inspector.High))
		}
		// 1.30 Leader not found events
		if dv.LeaderNotFound == 0 {
			r = append(r, inspector.Pass("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
				"leader_not_found=0", inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
				fmt.Sprintf("leader_not_found=%d", dv.LeaderNotFound), inspector.Critical))
		}
		// Snapshot errors
		if dv.SnapshotErrors == 0 {
			r = append(r, inspector.Pass("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
				"snapshot_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Fail("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
				fmt.Sprintf("snapshot_errors=%d", dv.SnapshotErrors), inspector.High))
		}
		// Client retries/timeouts
		if dv.ClientRetries == 0 && dv.ClientTimeouts == 0 {
			r = append(r, inspector.Pass("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
				"retries=0 timeouts=0", inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
				fmt.Sprintf("retries=%d timeouts=%d", dv.ClientRetries, dv.ClientTimeouts), inspector.Medium))
		}
	}
	return r
}
// checkRQLiteCrossNode compares the parsed /status documents of all nodes:
// the single-leader invariant, Raft term and leader agreement, applied-index
// and database-size convergence, version consistency, and quorum math.
// Nodes without a parsed status document are excluded from every comparison.
func checkRQLiteCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	type nodeInfo struct {
		host   string
		name   string
		status *inspector.RQLiteStatus
	}
	// Collect only the nodes that produced a parsed /status document.
	var nodes []nodeInfo
	for host, nd := range data.Nodes {
		if nd.RQLite != nil && nd.RQLite.Status != nil {
			nodes = append(nodes, nodeInfo{host: host, name: nd.Node.Name(), status: nd.RQLite.Status})
		}
	}
	// Cross-node comparisons are meaningless for fewer than two nodes.
	if len(nodes) < 2 {
		r = append(r, inspector.Skip("rqlite.cross_node", "Cross-node checks", rqliteSub, "",
			fmt.Sprintf("only %d node(s) with RQLite data, need >=2", len(nodes)), inspector.Critical))
		return r
	}
	// 1.5 Exactly one leader
	leaders := 0
	var leaderName string
	for _, n := range nodes {
		if n.status.RaftState == "Leader" {
			leaders++
			leaderName = n.name
		}
	}
	switch leaders {
	case 1:
		r = append(r, inspector.Pass("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("leader=%s", leaderName), inspector.Critical))
	case 0:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			"no leader found", inspector.Critical))
	default:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("found %d leaders (split brain!)", leaders), inspector.Critical))
	}
	// 1.6 Term consistency — group node names by the Raft term they report.
	terms := map[uint64][]string{}
	for _, n := range nodes {
		terms[n.status.Term] = append(terms[n.status.Term], n.name)
	}
	if len(terms) == 1 {
		// Single-entry map: this loop runs exactly once to extract the term.
		for t := range terms {
			r = append(r, inspector.Pass("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
				fmt.Sprintf("term=%d across %d nodes", t, len(nodes)), inspector.Critical))
		}
	} else {
		var parts []string
		for t, names := range terms {
			parts = append(parts, fmt.Sprintf("term=%d: %s", t, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
			"term divergence: "+strings.Join(parts, "; "), inspector.Critical))
	}
	// 1.36 All nodes agree on same leader
	leaderIDs := map[string][]string{}
	for _, n := range nodes {
		leaderIDs[n.status.LeaderNodeID] = append(leaderIDs[n.status.LeaderNodeID], n.name)
	}
	if len(leaderIDs) == 1 {
		for lid := range leaderIDs {
			r = append(r, inspector.Pass("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
				fmt.Sprintf("leader_id=%s", lid), inspector.Critical))
		}
	} else {
		var parts []string
		for lid, names := range leaderIDs {
			id := lid
			if id == "" {
				id = "(none)"
			}
			parts = append(parts, fmt.Sprintf("%s: %s", id, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
			"leader disagreement: "+strings.Join(parts, "; "), inspector.Critical))
	}
	// 1.38 Applied index convergence — spread between the slowest and fastest
	// node; zero indexes are treated as "no data" and excluded.
	var minApplied, maxApplied uint64
	hasApplied := false
	for _, n := range nodes {
		idx := n.status.AppliedIndex
		if idx == 0 {
			continue
		}
		if !hasApplied {
			minApplied = idx
			maxApplied = idx
			hasApplied = true
			continue
		}
		if idx < minApplied {
			minApplied = idx
		}
		if idx > maxApplied {
			maxApplied = idx
		}
	}
	if hasApplied && maxApplied > 0 {
		gap := maxApplied - minApplied
		if gap < 100 {
			r = append(r, inspector.Pass("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d", minApplied, maxApplied, gap), inspector.Critical))
		} else if gap < 1000 {
			r = append(r, inspector.Warn("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (lagging)", minApplied, maxApplied, gap), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (severely behind)", minApplied, maxApplied, gap), inspector.Critical))
		}
	}
	// 1.35 Version consistency — group node names by reported version.
	versions := map[string][]string{}
	for _, n := range nodes {
		if n.status.Version != "" {
			versions[n.status.Version] = append(versions[n.status.Version], n.name)
		}
	}
	if len(versions) == 1 {
		for v := range versions {
			r = append(r, inspector.Pass("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
				fmt.Sprintf("version=%s", v), inspector.Medium))
		}
	} else if len(versions) > 1 {
		var parts []string
		for v, names := range versions {
			parts = append(parts, fmt.Sprintf("%s: %s", v, strings.Join(names, ",")))
		}
		r = append(r, inspector.Warn("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
			"version mismatch: "+strings.Join(parts, "; "), inspector.Medium))
	}
	// 1.40 Database size convergence — compare max/min ratio across nodes
	// that report a positive size.
	type sizeEntry struct {
		name string
		size int64
	}
	var sizes []sizeEntry
	for _, n := range nodes {
		if n.status.DBSize > 0 {
			sizes = append(sizes, sizeEntry{n.name, n.status.DBSize})
		}
	}
	if len(sizes) >= 2 {
		minSize := sizes[0].size
		maxSize := sizes[0].size
		for _, s := range sizes[1:] {
			if s.size < minSize {
				minSize = s.size
			}
			if s.size > maxSize {
				maxSize = s.size
			}
		}
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 1.05 {
				r = append(r, inspector.Pass("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f", minSize, maxSize, ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f (diverged)", minSize, maxSize, ratio), inspector.High))
			}
		}
	}
	// 1.42 Quorum math
	// NOTE(review): only nodes that returned a /status document are counted,
	// so reachableVoters always equals voters here and the fail branch cannot
	// currently trigger — confirm whether unreachable voters should be
	// sourced from the node config instead.
	voters := 0
	reachableVoters := 0
	for _, n := range nodes {
		if n.status.Voter {
			voters++
			reachableVoters++ // responded to SSH + curl = reachable
		}
	}
	quorumNeeded := int(math.Floor(float64(voters)/2)) + 1
	if reachableVoters >= quorumNeeded {
		r = append(r, inspector.Pass("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d", reachableVoters, voters, quorumNeeded), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d (QUORUM LOST)", reachableVoters, voters, quorumNeeded), inspector.Critical))
	}
	return r
}
// countRQLiteNodes counts nodes that have RQLite data.
func countRQLiteNodes(data *inspector.ClusterData) int {
	n := 0
	for _, nd := range data.Nodes {
		if nd.RQLite == nil {
			continue
		}
		n++
	}
	return n
}

View File

@ -0,0 +1,401 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckRQLite_Unresponsive verifies that an unresponsive HTTP endpoint
// fails the liveness check and short-circuits the remaining per-node checks.
func TestCheckRQLite_Unresponsive(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{Responsive: false}
	results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, results, "rqlite.responsive", inspector.StatusFail)
	// The checker must return early — no raft_state check.
	if findCheck(results, "rqlite.raft_state") != nil {
		t.Error("should not check raft_state when unresponsive")
	}
}
// TestCheckRQLite_HealthyLeader verifies that a fully healthy leader node
// passes every per-node RQLite check.
func TestCheckRQLite_HealthyLeader(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		StrongRead: true,
		Readyz:     &inspector.RQLiteReadyz{Ready: true, Node: "ready", Leader: "ready", Store: "ready"},
		Status: &inspector.RQLiteStatus{
			RaftState:      "Leader",
			LeaderNodeID:   "node1",
			Voter:          true,
			NumPeers:       2,
			Term:           5,
			CommitIndex:    1000,
			AppliedIndex:   1000,
			FsmPending:     0,
			LastLogTerm:    5,
			DBAppliedIndex: 1000,
			FsmIndex:       1000,
			LastSnapshot:   995,
			DBSizeFriendly: "1.2MB",
			Goroutines:     50,
			HeapAlloc:      100 * 1024 * 1024, // 100MB
			Version:        "8.0.0",
		},
		Nodes: map[string]*inspector.RQLiteNode{
			"node1:5001": {Addr: "node1:5001", Reachable: true, Leader: true, Voter: true},
			"node2:5001": {Addr: "node2:5001", Reachable: true, Leader: false, Voter: true},
			"node3:5001": {Addr: "node3:5001", Reachable: true, Leader: false, Voter: true},
		},
		DebugVars: &inspector.RQLiteDebugVars{},
	}
	results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	// Every per-node check should pass for this fixture.
	for _, id := range []string{
		"rqlite.responsive",
		"rqlite.readyz",
		"rqlite.raft_state",
		"rqlite.leader_known",
		"rqlite.voter",
		"rqlite.commit_applied_gap",
		"rqlite.fsm_pending",
		"rqlite.db_fsm_sync",
		"rqlite.strong_read",
		"rqlite.all_reachable",
		"rqlite.goroutines",
		"rqlite.memory",
		"rqlite.query_errors",
		"rqlite.execute_errors",
		"rqlite.leader_not_found",
		"rqlite.snapshot_errors",
		"rqlite.client_health",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckRQLite_RaftStates verifies the rqlite.raft_state mapping from
// Raft state strings to pass/warn/fail.
func TestCheckRQLite_RaftStates(t *testing.T) {
	cases := []struct {
		state  string
		status inspector.Status
	}{
		{state: "Leader", status: inspector.StatusPass},
		{state: "Follower", status: inspector.StatusPass},
		{state: "Candidate", status: inspector.StatusWarn},
		{state: "Shutdown", status: inspector.StatusFail},
		{state: "Unknown", status: inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.state, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    tc.state,
					LeaderNodeID: "node1",
					Voter:        true,
				},
			}
			results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, results, "rqlite.raft_state", tc.status)
		})
	}
}
func TestCheckRQLite_ReadyzFail(t *testing.T) {
	// A readyz probe that reports not-ready must fail rqlite.readyz.
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Readyz:     &inspector.RQLiteReadyz{Ready: false, Node: "ready", Leader: "not ready", Store: "ready"},
		Status:     &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
	}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "rqlite.readyz", inspector.StatusFail)
}
func TestCheckRQLite_CommitAppliedGap(t *testing.T) {
	// The commit-vs-applied index gap drives the check severity.
	cases := []struct {
		name    string
		commit  uint64
		applied uint64
		status  inspector.Status
	}{
		{"no gap", 1000, 1000, inspector.StatusPass},
		{"small gap", 1002, 1000, inspector.StatusPass},
		{"lagging", 1050, 1000, inspector.StatusWarn},
		{"severely behind", 2000, 1000, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Follower",
					LeaderNodeID: "n1",
					Voter:        true,
					CommitIndex:  tc.commit,
					AppliedIndex: tc.applied,
				},
			}
			got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "rqlite.commit_applied_gap", tc.status)
		})
	}
}
func TestCheckRQLite_FsmPending(t *testing.T) {
	// Pending FSM applications: none pass, a few warn, a backlog fails.
	cases := []struct {
		name    string
		pending uint64
		status  inspector.Status
	}{
		{"zero", 0, inspector.StatusPass},
		{"small", 5, inspector.StatusWarn},
		{"backlog", 100, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Follower",
					LeaderNodeID: "n1",
					Voter:        true,
					FsmPending:   tc.pending,
				},
			}
			got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "rqlite.fsm_pending", tc.status)
		})
	}
}
func TestCheckRQLite_StrongReadFail(t *testing.T) {
	// A failed strong (linearizable) read must fail rqlite.strong_read.
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		StrongRead: false,
		Status:     &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
	}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "rqlite.strong_read", inspector.StatusFail)
}
func TestCheckRQLite_DebugVarsErrors(t *testing.T) {
	// Non-zero error counters in debug vars degrade the related checks.
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Status:     &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true},
		DebugVars: &inspector.RQLiteDebugVars{
			QueryErrors:    5,
			ExecuteErrors:  3,
			LeaderNotFound: 1,
			SnapshotErrors: 2,
			ClientRetries:  10,
			ClientTimeouts: 1,
		},
	}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for id, want := range map[string]inspector.Status{
		"rqlite.query_errors":     inspector.StatusWarn,
		"rqlite.execute_errors":   inspector.StatusWarn,
		"rqlite.leader_not_found": inspector.StatusFail,
		"rqlite.snapshot_errors":  inspector.StatusFail,
		"rqlite.client_health":    inspector.StatusWarn,
	} {
		expectStatus(t, got, id, want)
	}
}
func TestCheckRQLite_Goroutines(t *testing.T) {
	// Goroutine count thresholds: low pass, elevated warn, very high fail.
	cases := []struct {
		name       string
		goroutines int
		status     inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 500, inspector.StatusWarn},
		{"high", 2000, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Leader",
					LeaderNodeID: "n1",
					Voter:        true,
					Goroutines:   tc.goroutines,
				},
			}
			got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "rqlite.goroutines", tc.status)
		})
	}
}
// --- Cross-node tests ---
// makeRQLiteCluster builds a multi-node fixture where every node reports the
// given per-host raft state, the given leader host, and a shared term. Each
// node also lists every host (on port 5001) as a reachable voter peer.
func makeRQLiteCluster(leaderHost string, states map[string]string, term uint64) *inspector.ClusterData {
	peers := make(map[string]*inspector.RQLiteNode, len(states))
	for host, state := range states {
		peers[host+":5001"] = &inspector.RQLiteNode{
			Addr:      host + ":5001",
			Reachable: true,
			Voter:     true,
			Leader:    state == "Leader",
		}
	}
	nodes := make(map[string]*inspector.NodeData, len(states))
	for host, state := range states {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderHost,
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
				CommitIndex:  1000,
				Version:      "8.0.0",
				DBSize:       4096,
			},
			Nodes: peers,
		}
		nodes[host] = nd
	}
	return makeCluster(nodes)
}
func TestCheckRQLite_CrossNode_SingleLeader(t *testing.T) {
data := makeRQLiteCluster("1.1.1.1", map[string]string{
"1.1.1.1": "Leader",
"2.2.2.2": "Follower",
"3.3.3.3": "Follower",
}, 5)
results := CheckRQLite(data)
expectStatus(t, results, "rqlite.single_leader", inspector.StatusPass)
expectStatus(t, results, "rqlite.term_consistent", inspector.StatusPass)
expectStatus(t, results, "rqlite.leader_agreement", inspector.StatusPass)
expectStatus(t, results, "rqlite.index_convergence", inspector.StatusPass)
expectStatus(t, results, "rqlite.version_consistent", inspector.StatusPass)
expectStatus(t, results, "rqlite.quorum", inspector.StatusPass)
}
func TestCheckRQLite_CrossNode_NoLeader(t *testing.T) {
data := makeRQLiteCluster("", map[string]string{
"1.1.1.1": "Candidate",
"2.2.2.2": "Candidate",
"3.3.3.3": "Candidate",
}, 5)
results := CheckRQLite(data)
expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail)
}
func TestCheckRQLite_CrossNode_SplitBrain(t *testing.T) {
	// Two nodes each claim leadership (each pointing at itself): split brain.
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		state, leaderID := "Follower", "1.1.1.1"
		if host != "3.3.3.3" {
			state, leaderID = "Leader", host
		}
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderID,
				Voter:        true,
				Term:         5,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	got := CheckRQLite(makeCluster(nodes))
	expectStatus(t, got, "rqlite.single_leader", inspector.StatusFail)
}
func TestCheckRQLite_CrossNode_TermDivergence(t *testing.T) {
	// One node sits on a different raft term than the other two.
	terms := map[string]uint64{"1.1.1.1": 5, "2.2.2.2": 5, "3.3.3.3": 6}
	nodes := make(map[string]*inspector.NodeData, len(terms))
	for host, term := range terms {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    "Follower",
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	got := CheckRQLite(makeCluster(nodes))
	expectStatus(t, got, "rqlite.term_consistent", inspector.StatusFail)
}
func TestCheckRQLite_CrossNode_IndexLagging(t *testing.T) {
	// One follower's applied index trails the others by 500 entries.
	applied := map[string]uint64{"1.1.1.1": 1000, "2.2.2.2": 1000, "3.3.3.3": 500}
	nodes := make(map[string]*inspector.NodeData, len(applied))
	for host, idx := range applied {
		state := "Follower"
		if host == "1.1.1.1" {
			state = "Leader"
		}
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         5,
				AppliedIndex: idx,
				CommitIndex:  idx,
			},
		}
		nodes[host] = nd
	}
	got := CheckRQLite(makeCluster(nodes))
	expectStatus(t, got, "rqlite.index_convergence", inspector.StatusWarn)
}
func TestCheckRQLite_CrossNode_SkipSingleNode(t *testing.T) {
	// Cross-node checks need at least two nodes; a lone node yields a skip.
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Status:     &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true, Term: 5, AppliedIndex: 1000},
	}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "rqlite.cross_node", inspector.StatusSkip)
}
func TestCheckRQLite_NilRQLiteData(t *testing.T) {
	// With nd.RQLite nil there are no per-node checks; only the cross-node
	// skip (not enough nodes) may appear in the results.
	n := makeNodeData("1.1.1.1", "node")
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	for _, res := range got {
		if res.Status != inspector.StatusSkip {
			t.Errorf("unexpected non-skip result: %s (status=%s)", res.ID, res.Status)
		}
	}
}

View File

@ -0,0 +1,242 @@
package checks
import (
"fmt"
"strconv"
"strings"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the system checker with the inspector framework so it runs
// as part of every cluster inspection.
func init() {
	inspector.RegisterChecker("system", CheckSystem)
}
// systemSub is the subsystem label attached to every system check result.
const systemSub = "system"
// CheckSystem runs all system-level health checks. Nodes that did not report
// a system snapshot are skipped entirely.
func CheckSystem(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.System == nil {
			continue // no system data collected for this node
		}
		out = append(out, checkSystemPerNode(nd)...)
	}
	return out
}
// svcStatusResult builds the pass/fail result for a systemd unit that is
// expected to be "active"; any other reported status (including "unknown"
// when the unit was not collected at all) is a critical failure.
func svcStatusResult(id, name, node, status string) inspector.CheckResult {
	if status == "active" {
		return inspector.Pass(id, name, systemSub, node, "active", inspector.Critical)
	}
	return inspector.Fail(id, name, systemSub, node,
		fmt.Sprintf("status=%s", status), inspector.Critical)
}

// checkSystemPerNode runs every per-node system check for one node: systemd
// services, failed units, memory/disk/inode pressure, load average, OOM
// kills, swap, uptime, firewall, process user, recent panics, and expected
// listening ports. Checks whose inputs were not collected are skipped.
func checkSystemPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	sys := nd.System
	node := nd.Node.Name()
	// 6.1 Core services active.
	coreServices := []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster"}
	for _, svc := range coreServices {
		status, ok := sys.Services[svc]
		if !ok {
			status = "unknown"
		}
		id := fmt.Sprintf("system.svc_%s", strings.ReplaceAll(svc, "-", "_"))
		name := fmt.Sprintf("%s service active", svc)
		r = append(r, svcStatusResult(id, name, node, status))
	}
	// 6.5 WireGuard service — only checked when the collector reported it.
	if status, ok := sys.Services["wg-quick@wg0"]; ok {
		r = append(r, svcStatusResult("system.svc_wg", "wg-quick@wg0 active", node, status))
	}
	// 6.3 Nameserver-only services.
	if nd.Node.IsNameserver() {
		for _, svc := range []string{"coredns", "caddy"} {
			status, ok := sys.Services[svc]
			if !ok {
				status = "unknown"
			}
			r = append(r, svcStatusResult(
				fmt.Sprintf("system.svc_%s", svc),
				fmt.Sprintf("%s service active", svc),
				node, status))
		}
	}
	// 6.6 Failed systemd units.
	if len(sys.FailedUnits) == 0 {
		r = append(r, inspector.Pass("system.no_failed_units", "No failed systemd units", systemSub, node,
			"no failed units", inspector.High))
	} else {
		r = append(r, inspector.Fail("system.no_failed_units", "No failed systemd units", systemSub, node,
			fmt.Sprintf("failed: %s", strings.Join(sys.FailedUnits, ", ")), inspector.High))
	}
	// 6.14 Memory usage: <80% pass, <90% warn, otherwise critical fail.
	if sys.MemTotalMB > 0 {
		pct := float64(sys.MemUsedMB) / float64(sys.MemTotalMB) * 100
		if pct < 80 {
			r = append(r, inspector.Pass("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Medium))
		} else if pct < 90 {
			r = append(r, inspector.Warn("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%% CRITICAL)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Critical))
		}
	}
	// 6.15 Disk usage: same 80/90 thresholds as memory.
	if sys.DiskUsePct > 0 {
		if sys.DiskUsePct < 80 {
			r = append(r, inspector.Pass("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
		} else if sys.DiskUsePct < 90 {
			r = append(r, inspector.Warn("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%% CRITICAL)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.Critical))
		}
	}
	// 6.17 1-minute load vs CPU count: <1x pass, <2x warn, else fail.
	// strings.Cut replaces the old Split: Split always returns at least one
	// element, so the previous `len(parts) >= 1` guard was dead code.
	if sys.LoadAvg != "" && sys.CPUCount > 0 {
		first, _, _ := strings.Cut(strings.TrimSpace(sys.LoadAvg), ",")
		// Unparseable load strings are silently skipped (best-effort check).
		if load1, err := strconv.ParseFloat(strings.TrimSpace(first), 64); err == nil {
			cpus := float64(sys.CPUCount)
			switch {
			case load1 < cpus:
				r = append(r, inspector.Pass("system.load", "Load average healthy", systemSub, node,
					fmt.Sprintf("load1=%.1f cpus=%d", load1, sys.CPUCount), inspector.Medium))
			case load1 < cpus*2:
				r = append(r, inspector.Warn("system.load", "Load average healthy", systemSub, node,
					fmt.Sprintf("load1=%.1f cpus=%d (elevated)", load1, sys.CPUCount), inspector.Medium))
			default:
				r = append(r, inspector.Fail("system.load", "Load average healthy", systemSub, node,
					fmt.Sprintf("load1=%.1f cpus=%d (overloaded)", load1, sys.CPUCount), inspector.High))
			}
		}
	}
	// 6.18 OOM kills observed in dmesg.
	if sys.OOMKills == 0 {
		r = append(r, inspector.Pass("system.oom", "No OOM kills", systemSub, node,
			"no OOM kills in dmesg", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("system.oom", "No OOM kills", systemSub, node,
			fmt.Sprintf("%d OOM kills in dmesg", sys.OOMKills), inspector.Critical))
	}
	// 6.19 Swap usage: warn at >=30%.
	if sys.SwapTotalMB > 0 {
		pct := float64(sys.SwapUsedMB) / float64(sys.SwapTotalMB) * 100
		if pct < 30 {
			r = append(r, inspector.Pass("system.swap", "Swap usage low", systemSub, node,
				fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("system.swap", "Swap usage low", systemSub, node,
				fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
		}
	}
	// 6.20 Uptime: informational pass only when a value was collected.
	if sys.UptimeRaw != "" && sys.UptimeRaw != "unknown" {
		r = append(r, inspector.Pass("system.uptime", "System uptime reported", systemSub, node,
			fmt.Sprintf("up since %s", sys.UptimeRaw), inspector.Low))
	}
	// 6.21 Inode usage: <80% pass, <95% warn, else critical fail.
	if sys.InodePct > 0 {
		if sys.InodePct < 80 {
			r = append(r, inspector.Pass("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%%", sys.InodePct), inspector.High))
		} else if sys.InodePct < 95 {
			r = append(r, inspector.Warn("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%% (elevated)", sys.InodePct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%% (CRITICAL)", sys.InodePct), inspector.Critical))
		}
	}
	// 6.22 UFW firewall (warn only — may be managed by other means).
	if sys.UFWActive {
		r = append(r, inspector.Pass("system.ufw", "UFW firewall active", systemSub, node,
			"ufw is active", inspector.High))
	} else {
		r = append(r, inspector.Warn("system.ufw", "UFW firewall active", systemSub, node,
			"ufw is not active", inspector.High))
	}
	// 6.23 debros-node should run as the "debros" user; root gets a
	// higher-severity warning than an arbitrary user.
	if sys.ProcessUser != "" && sys.ProcessUser != "unknown" {
		if sys.ProcessUser == "debros" {
			r = append(r, inspector.Pass("system.process_user", "debros-node runs as correct user", systemSub, node,
				"user=debros", inspector.High))
		} else if sys.ProcessUser == "root" {
			r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
				"user=root (should be debros)", inspector.High))
		} else {
			r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
				fmt.Sprintf("user=%s (expected debros)", sys.ProcessUser), inspector.Medium))
		}
	}
	// 6.24 panic/fatal occurrences in recent logs.
	if sys.PanicCount == 0 {
		r = append(r, inspector.Pass("system.panics", "No panics in recent logs", systemSub, node,
			"0 panic/fatal in last hour", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("system.panics", "No panics in recent logs", systemSub, node,
			fmt.Sprintf("%d panic/fatal in last hour", sys.PanicCount), inspector.Critical))
	}
	// 6.25 Expected ports listening (warn rather than fail: the service
	// checks above already fail hard when the owning daemon is down).
	expectedPorts := map[int]string{
		5001: "RQLite HTTP",
		3322: "Olric Memberlist",
		6001: "Gateway",
		4501: "IPFS API",
	}
	for port, svcName := range expectedPorts {
		found := false
		for _, p := range sys.ListeningPorts {
			if p == port {
				found = true
				break
			}
		}
		if found {
			r = append(r, inspector.Pass(
				fmt.Sprintf("system.port_%d", port),
				fmt.Sprintf("%s port %d listening", svcName, port),
				systemSub, node, "port is bound", inspector.High))
		} else {
			r = append(r, inspector.Warn(
				fmt.Sprintf("system.port_%d", port),
				fmt.Sprintf("%s port %d listening", svcName, port),
				systemSub, node, "port is NOT bound", inspector.High))
		}
	}
	return r
}

View File

@ -0,0 +1,284 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
func TestCheckSystem_HealthyNode(t *testing.T) {
	// Fully healthy node: every check fed by this snapshot should pass.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "active",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "active",
			"wg-quick@wg0":        "active",
		},
		FailedUnits:    nil,
		MemTotalMB:     8192,
		MemUsedMB:      4096,
		DiskUsePct:     50,
		DiskUsedGB:     "25G",
		DiskTotalGB:    "50G",
		LoadAvg:        "1.0, 0.8, 0.5",
		CPUCount:       4,
		OOMKills:       0,
		SwapTotalMB:    2048,
		SwapUsedMB:     100,
		UptimeRaw:      "2024-01-01 00:00:00",
		InodePct:       10,
		ListeningPorts: []int{5001, 3322, 6001, 4501},
		UFWActive:      true,
		ProcessUser:    "debros",
		PanicCount:     0,
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	for _, id := range []string{
		"system.svc_debros_node", "system.svc_debros_olric",
		"system.svc_debros_ipfs", "system.svc_debros_ipfs_cluster",
		"system.svc_wg", "system.no_failed_units", "system.memory",
		"system.disk", "system.load", "system.oom", "system.swap",
		"system.inodes", "system.ufw", "system.process_user",
		"system.panics", "system.port_5001", "system.port_3322",
		"system.port_6001", "system.port_4501",
	} {
		expectStatus(t, got, id, inspector.StatusPass)
	}
}
func TestCheckSystem_ServiceInactive(t *testing.T) {
	// "inactive" and "failed" service states must be reported as failures.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "inactive",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "failed",
		},
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, got, "system.svc_debros_node", inspector.StatusPass)
	expectStatus(t, got, "system.svc_debros_olric", inspector.StatusFail)
	expectStatus(t, got, "system.svc_debros_ipfs_cluster", inspector.StatusFail)
}
func TestCheckSystem_NameserverServices(t *testing.T) {
	// Nameserver nodes additionally get coredns/caddy service checks.
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "active",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "active",
			"coredns":             "active",
			"caddy":               "active",
		},
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}))
	expectStatus(t, got, "system.svc_coredns", inspector.StatusPass)
	expectStatus(t, got, "system.svc_caddy", inspector.StatusPass)
}
func TestCheckSystem_NameserverServicesNotCheckedOnRegularNode(t *testing.T) {
	// Regular nodes must not receive the coredns/caddy checks at all.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "active",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "active",
		},
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	if findCheck(got, "system.svc_coredns") != nil {
		t.Error("should not check coredns on regular node")
	}
}
func TestCheckSystem_FailedUnits(t *testing.T) {
	// Any failed systemd unit flips system.no_failed_units to a failure.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services:    map[string]string{},
		FailedUnits: []string{"some-service.service"},
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, got, "system.no_failed_units", inspector.StatusFail)
}
func TestCheckSystem_Memory(t *testing.T) {
	// Memory thresholds: <80% pass, 80-90% warn, >=90% fail.
	cases := []struct {
		name   string
		used   int
		total  int
		status inspector.Status
	}{
		{"healthy", 4000, 8000, inspector.StatusPass},  // 50%
		{"elevated", 7000, 8000, inspector.StatusWarn}, // 87.5%
		{"critical", 7500, 8000, inspector.StatusFail}, // 93.75%
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services:   map[string]string{},
				MemTotalMB: tc.total,
				MemUsedMB:  tc.used,
			}
			got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, got, "system.memory", tc.status)
		})
	}
}
func TestCheckSystem_Disk(t *testing.T) {
	// Disk thresholds: <80% pass, 80-90% warn, >=90% fail.
	cases := []struct {
		name   string
		pct    int
		status inspector.Status
	}{
		{"healthy", 60, inspector.StatusPass},
		{"elevated", 85, inspector.StatusWarn},
		{"critical", 92, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services:    map[string]string{},
				DiskUsePct:  tc.pct,
				DiskUsedGB:  "25G",
				DiskTotalGB: "50G",
			}
			got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, got, "system.disk", tc.status)
		})
	}
}
func TestCheckSystem_Load(t *testing.T) {
	// 1-minute load vs CPU count: <1x pass, <2x warn, >=2x fail.
	cases := []struct {
		name   string
		load   string
		cpus   int
		status inspector.Status
	}{
		{"healthy", "1.0, 0.8, 0.5", 4, inspector.StatusPass},
		{"elevated", "6.0, 5.0, 4.0", 4, inspector.StatusWarn},
		{"overloaded", "10.0, 9.0, 8.0", 4, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services: map[string]string{},
				LoadAvg:  tc.load,
				CPUCount: tc.cpus,
			}
			got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, got, "system.load", tc.status)
		})
	}
}
func TestCheckSystem_OOMKills(t *testing.T) {
	// Any OOM kill recorded in dmesg is a hard failure.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{Services: map[string]string{}, OOMKills: 3}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, got, "system.oom", inspector.StatusFail)
}
func TestCheckSystem_Inodes(t *testing.T) {
	// Inode thresholds: <80% pass, 80-95% warn, >=95% fail.
	cases := []struct {
		name   string
		pct    int
		status inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 82, inspector.StatusWarn},
		{"critical", 96, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{Services: map[string]string{}, InodePct: tc.pct}
			got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, got, "system.inodes", tc.status)
		})
	}
}
func TestCheckSystem_ProcessUser(t *testing.T) {
	// Only the "debros" user passes; root or anything else only warns.
	cases := []struct {
		name   string
		user   string
		status inspector.Status
	}{
		{"correct", "debros", inspector.StatusPass},
		{"root", "root", inspector.StatusWarn},
		{"other", "ubuntu", inspector.StatusWarn},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{Services: map[string]string{}, ProcessUser: tc.user}
			got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, got, "system.process_user", tc.status)
		})
	}
}
func TestCheckSystem_Panics(t *testing.T) {
	// Any panic/fatal entries in recent logs must fail system.panics.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{Services: map[string]string{}, PanicCount: 5}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, got, "system.panics", inspector.StatusFail)
}
func TestCheckSystem_ExpectedPorts(t *testing.T) {
	// Bound ports pass; expected-but-unbound ports only warn.
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services:       map[string]string{},
		ListeningPorts: []int{5001, 6001}, // 3322 and 4501 deliberately absent
	}
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, got, "system.port_5001", inspector.StatusPass)
	expectStatus(t, got, "system.port_6001", inspector.StatusPass)
	expectStatus(t, got, "system.port_3322", inspector.StatusWarn)
	expectStatus(t, got, "system.port_4501", inspector.StatusWarn)
}
func TestCheckSystem_NilData(t *testing.T) {
	// Nodes without a System snapshot are skipped entirely — no results.
	nd := makeNodeData("1.1.1.1", "node")
	got := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	if len(got) != 0 {
		t.Errorf("expected 0 results for nil System data, got %d", len(got))
	}
}

View File

@ -0,0 +1,270 @@
package checks
import (
"fmt"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the WireGuard checker with the inspector framework so it
// runs as part of every cluster inspection.
func init() {
	inspector.RegisterChecker("wireguard", CheckWireGuard)
}
// wgSub is the subsystem label attached to every WireGuard check result.
const wgSub = "wireguard"
// CheckWireGuard runs all WireGuard health checks: per-node checks for every
// node that reported WireGuard data, then cross-node consistency checks.
func CheckWireGuard(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.WireGuard == nil {
			continue // no WireGuard data collected for this node
		}
		out = append(out, checkWGPerNode(nd, data)...)
	}
	return append(out, checkWGCrossNode(data)...)
}
// abbrevKey renders a WireGuard public key as "prefix...suffix" for detail
// messages. Keys shorter than 12 characters are returned unchanged, which
// avoids the slice-bounds panic the previous inline PublicKey[:8] /
// PublicKey[len-4:] slicing had on short or malformed keys.
func abbrevKey(key string) string {
	if len(key) < 12 {
		return key
	}
	return key[:8] + "..." + key[len(key)-4:]
}

// checkWGPerNode evaluates WireGuard health for a single node: interface and
// service state, address/port/MTU configuration, config-file hygiene, and
// per-peer allowed-IP, handshake, and traffic checks.
func checkWGPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	wg := nd.WireGuard
	node := nd.Node.Name()
	// 5.1 Interface up — all other checks are meaningless if wg0 is down,
	// so bail out early on failure.
	if wg.InterfaceUp {
		r = append(r, inspector.Pass("wg.interface_up", "WireGuard interface up", wgSub, node,
			fmt.Sprintf("wg0 up, IP=%s", wg.WgIP), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("wg.interface_up", "WireGuard interface up", wgSub, node,
			"wg0 interface is DOWN", inspector.Critical))
		return r
	}
	// 5.2 Service active (warn only: the interface is demonstrably up).
	if wg.ServiceActive {
		r = append(r, inspector.Pass("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
			"service is active", inspector.Critical))
	} else {
		r = append(r, inspector.Warn("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
			"service not active (interface up but service not managed by systemd?)", inspector.High))
	}
	// 5.5 Address must come from the expected 10.0.0.0/24 overlay range;
	// skipped entirely when no IP was collected.
	if wg.WgIP != "" && strings.HasPrefix(wg.WgIP, "10.0.0.") {
		r = append(r, inspector.Pass("wg.correct_ip", "WG IP in expected range", wgSub, node,
			fmt.Sprintf("IP=%s (10.0.0.0/24)", wg.WgIP), inspector.Critical))
	} else if wg.WgIP != "" {
		r = append(r, inspector.Warn("wg.correct_ip", "WG IP in expected range", wgSub, node,
			fmt.Sprintf("IP=%s (not in 10.0.0.0/24)", wg.WgIP), inspector.High))
	}
	// 5.4 Listen port (skipped when the port was not collected).
	if wg.ListenPort == 51820 {
		r = append(r, inspector.Pass("wg.listen_port", "Listen port is 51820", wgSub, node,
			"port=51820", inspector.Critical))
	} else if wg.ListenPort > 0 {
		r = append(r, inspector.Warn("wg.listen_port", "Listen port is 51820", wgSub, node,
			fmt.Sprintf("port=%d (expected 51820)", wg.ListenPort), inspector.High))
	}
	// 5.7 Peer count: every other WG-enabled node should appear as a peer.
	expectedNodes := countWGNodes(data)
	expectedPeers := expectedNodes - 1
	if expectedPeers < 0 {
		expectedPeers = 0
	}
	if wg.PeerCount >= expectedPeers {
		r = append(r, inspector.Pass("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
	} else if wg.PeerCount > 0 {
		r = append(r, inspector.Warn("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
	} else {
		r = append(r, inspector.Fail("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (isolated!)", wg.PeerCount), inspector.Critical))
	}
	// 5.29 MTU should be the WireGuard default of 1420.
	if wg.MTU == 1420 {
		r = append(r, inspector.Pass("wg.mtu", "MTU is 1420", wgSub, node,
			"MTU=1420", inspector.High))
	} else if wg.MTU > 0 {
		r = append(r, inspector.Warn("wg.mtu", "MTU is 1420", wgSub, node,
			fmt.Sprintf("MTU=%d (expected 1420)", wg.MTU), inspector.High))
	}
	// 5.35 Config file exists.
	if wg.ConfigExists {
		r = append(r, inspector.Pass("wg.config_exists", "Config file exists", wgSub, node,
			"/etc/wireguard/wg0.conf present", inspector.High))
	} else {
		r = append(r, inspector.Warn("wg.config_exists", "Config file exists", wgSub, node,
			"/etc/wireguard/wg0.conf NOT found", inspector.High))
	}
	// 5.36 Config permissions must be 600 (contains the private key).
	// "" and "000" mean the collector could not stat the file — skipped.
	if wg.ConfigPerms == "600" {
		r = append(r, inspector.Pass("wg.config_perms", "Config file permissions 600", wgSub, node,
			"perms=600", inspector.Critical))
	} else if wg.ConfigPerms != "" && wg.ConfigPerms != "000" {
		r = append(r, inspector.Warn("wg.config_perms", "Config file permissions 600", wgSub, node,
			fmt.Sprintf("perms=%s (expected 600)", wg.ConfigPerms), inspector.Critical))
	}
	// Per-peer checks: allowed-IP problems are reported per peer, while
	// handshake/traffic issues are aggregated into summary results below.
	now := time.Now().Unix()
	neverHandshaked := 0
	staleHandshakes := 0
	noTraffic := 0
	for _, peer := range wg.Peers {
		// 5.20 Each peer should have exactly one /32 allowed IP.
		if !strings.Contains(peer.AllowedIPs, "/32") {
			r = append(r, inspector.Warn("wg.peer_allowed_ip", "Peer has /32 allowed IP", wgSub, node,
				fmt.Sprintf("peer %s has allowed_ips=%s", abbrevKey(peer.PublicKey), peer.AllowedIPs), inspector.High))
		}
		// 5.23 No peer may claim the catch-all route.
		if strings.Contains(peer.AllowedIPs, "0.0.0.0/0") {
			r = append(r, inspector.Fail("wg.peer_catch_all", "No catch-all route peer", wgSub, node,
				fmt.Sprintf("peer %s has 0.0.0.0/0 (route hijack!)", abbrevKey(peer.PublicKey)), inspector.Critical))
		}
		// 5.11-5.12 Handshake freshness (0 means the peer never handshaked).
		if peer.LatestHandshake == 0 {
			neverHandshaked++
		} else if now-peer.LatestHandshake > 300 {
			staleHandshakes++
		}
		// 5.13 Transfer stats.
		if peer.TransferRx == 0 && peer.TransferTx == 0 {
			noTraffic++
		}
	}
	if len(wg.Peers) > 0 {
		// 5.12 Peers that never handshaked.
		if neverHandshaked == 0 {
			r = append(r, inspector.Pass("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers handshaked", len(wg.Peers), len(wg.Peers)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers never handshaked", neverHandshaked, len(wg.Peers)), inspector.Critical))
		}
		// 5.11 Stale handshakes (older than 5 minutes).
		if staleHandshakes == 0 {
			r = append(r, inspector.Pass("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
				"all handshakes within 5 minutes", inspector.High))
		} else {
			r = append(r, inspector.Warn("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
				fmt.Sprintf("%d/%d peers with stale handshake (>5m)", staleHandshakes, len(wg.Peers)), inspector.High))
		}
		// 5.13 Peers with zero traffic in both directions.
		if noTraffic == 0 {
			r = append(r, inspector.Pass("wg.peer_traffic", "All peers have traffic", wgSub, node,
				fmt.Sprintf("%d/%d peers with traffic", len(wg.Peers), len(wg.Peers)), inspector.High))
		} else {
			r = append(r, inspector.Warn("wg.peer_traffic", "All peers have traffic", wgSub, node,
				fmt.Sprintf("%d/%d peers with zero traffic", noTraffic, len(wg.Peers)), inspector.High))
		}
	}
	return r
}
// checkWGCrossNode runs WireGuard consistency checks that compare nodes
// against each other: peer counts, MTU values, and peer-key uniqueness.
// It requires at least two nodes with the interface up; otherwise it
// returns no results.
func checkWGCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	type nodeInfo struct {
		name string
		wg   *inspector.WireGuardData
	}
	var nodes []nodeInfo
	for _, nd := range data.Nodes {
		if nd.WireGuard != nil && nd.WireGuard.InterfaceUp {
			nodes = append(nodes, nodeInfo{name: nd.Node.Name(), wg: nd.WireGuard})
		}
	}
	if len(nodes) < 2 {
		return r
	}
	// 5.8 Peer count consistent across nodes.
	counts := map[int]int{}
	for _, n := range nodes {
		counts[n.wg.PeerCount]++
	}
	if len(counts) == 1 {
		for c := range counts {
			r = append(r, inspector.Pass("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
				fmt.Sprintf("all nodes have %d peers", c), inspector.High))
		}
	} else {
		var parts []string
		for c, num := range counts {
			parts = append(parts, fmt.Sprintf("%d nodes have %d peers", num, c))
		}
		r = append(r, inspector.Warn("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
			strings.Join(parts, "; "), inspector.High))
	}
	// 5.30 MTU consistent across nodes (MTU<=0 means not collected).
	mtus := map[int]int{}
	for _, n := range nodes {
		if n.wg.MTU > 0 {
			mtus[n.wg.MTU]++
		}
	}
	if len(mtus) == 1 {
		for m := range mtus {
			r = append(r, inspector.Pass("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
				fmt.Sprintf("all nodes MTU=%d", m), inspector.High))
		}
	} else if len(mtus) > 1 {
		r = append(r, inspector.Warn("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
			fmt.Sprintf("%d different MTU values", len(mtus)), inspector.High))
	}
	// 5.50 Public key uniqueness: in a healthy mesh each key appears at most
	// once per *other* node, i.e. at most len(nodes)-1 times overall. More
	// occurrences mean a shared private key or duplicated peer entries.
	allKeys := map[string][]string{}
	for _, n := range nodes {
		for _, peer := range n.wg.Peers {
			allKeys[peer.PublicKey] = append(allKeys[peer.PublicKey], n.name)
		}
	}
	dupeKeys := 0
	for _, names := range allKeys {
		if len(names) > len(nodes)-1 {
			dupeKeys++
		}
	}
	if dupeKeys == 0 {
		r = append(r, inspector.Pass("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "",
			fmt.Sprintf("%d unique peer keys", len(allKeys)), inspector.Critical))
	} else {
		// Fix: previously no result was emitted at all when duplicates were
		// found, silently hiding a critical condition.
		r = append(r, inspector.Fail("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "",
			fmt.Sprintf("%d peer keys appear on more nodes than expected (possible duplicated keys)", dupeKeys), inspector.Critical))
	}
	return r
}
// countWGNodes reports how many nodes in the cluster returned any
// WireGuard data at all (regardless of whether the interface is up).
func countWGNodes(data *inspector.ClusterData) int {
	total := 0
	for _, entry := range data.Nodes {
		if entry.WireGuard == nil {
			continue
		}
		total++
	}
	return total
}

View File

@ -0,0 +1,230 @@
package checks
import (
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckWireGuard_InterfaceDown: a down wg0 interface must fail
// wg.interface_up and short-circuit the remaining per-node checks.
func TestCheckWireGuard_InterfaceDown(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{InterfaceUp: false}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	got := CheckWireGuard(cluster)
	expectStatus(t, got, "wg.interface_up", inspector.StatusFail)
	// Early return — no further per-node checks
	if c := findCheck(got, "wg.service_active"); c != nil {
		t.Error("should not check service_active when interface down")
	}
}
// TestCheckWireGuard_HealthyNode: a fully healthy single-node setup
// should pass every per-node WireGuard check.
func TestCheckWireGuard_HealthyNode(t *testing.T) {
	now := time.Now().Unix()
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp:   true,
		ServiceActive: true,
		WgIP:          "10.0.0.1",
		ListenPort:    51820,
		PeerCount:     2,
		MTU:           1420,
		ConfigExists:  true,
		ConfigPerms:   "600",
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: now - 30, TransferRx: 1000, TransferTx: 2000},
			{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: now - 60, TransferRx: 500, TransferTx: 800},
		},
	}
	// Single-node cluster so helper nodes cannot skew per-node assertions.
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	got := CheckWireGuard(cluster)
	for _, id := range []string{
		"wg.interface_up", "wg.service_active", "wg.correct_ip",
		"wg.listen_port", "wg.mtu", "wg.config_exists", "wg.config_perms",
		"wg.handshake_all", "wg.handshake_fresh", "wg.peer_traffic",
	} {
		expectStatus(t, got, id, inspector.StatusPass)
	}
}
// TestCheckWireGuard_WrongIP: an unexpected wg0 address is flagged as
// a warning by wg.correct_ip.
func TestCheckWireGuard_WrongIP(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "192.168.1.5",
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.correct_ip", inspector.StatusWarn)
}
// TestCheckWireGuard_WrongPort: a non-standard listen port is flagged
// as a warning by wg.listen_port.
func TestCheckWireGuard_WrongPort(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		ListenPort:  12345,
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.listen_port", inspector.StatusWarn)
}
// TestCheckWireGuard_PeerCountMismatch: a node seeing fewer peers than
// the cluster size implies should produce a wg.peer_count warning.
func TestCheckWireGuard_PeerCountMismatch(t *testing.T) {
	under := makeNodeData("1.1.1.1", "node")
	under.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 1}
	all := map[string]*inspector.NodeData{"1.1.1.1": under}
	for _, host := range []string{"2.2.2.2", "3.3.3.3", "4.4.4.4"} {
		peer := makeNodeData(host, "node")
		peer.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 3}
		all[host] = peer
	}
	got := CheckWireGuard(makeCluster(all))
	// Node 1.1.1.1 has 1 peer but expects 3 (4 nodes - 1)
	if findCheck(got, "wg.peer_count") == nil {
		t.Fatal("expected wg.peer_count check")
	}
	// At least one node should have a warn
	warned := false
	for _, res := range got {
		if res.ID == "wg.peer_count" && res.Status == inspector.StatusWarn {
			warned = true
			break
		}
	}
	if !warned {
		t.Error("expected at least one wg.peer_count warn for mismatched peer count")
	}
}
// TestCheckWireGuard_ZeroPeers: a node with zero peers is isolated and
// must produce a wg.peer_count failure.
func TestCheckWireGuard_ZeroPeers(t *testing.T) {
	isolated := makeNodeData("1.1.1.1", "node")
	isolated.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 0}
	all := map[string]*inspector.NodeData{"1.1.1.1": isolated}
	for _, host := range []string{"2.2.2.2", "3.3.3.3"} {
		peer := makeNodeData(host, "node")
		peer.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2}
		all[host] = peer
	}
	got := CheckWireGuard(makeCluster(all))
	failed := false
	for _, res := range got {
		if res.ID == "wg.peer_count" && res.Status == inspector.StatusFail {
			failed = true
			break
		}
	}
	if !failed {
		t.Error("expected wg.peer_count fail for isolated node")
	}
}
// TestCheckWireGuard_StaleHandshakes: handshakes older than the 5-minute
// freshness window trigger a wg.handshake_fresh warning.
func TestCheckWireGuard_StaleHandshakes(t *testing.T) {
	stale := time.Now().Unix() - 600 // 10 minutes ago, well past the window
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   2,
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: stale, TransferRx: 100, TransferTx: 200},
			{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: stale, TransferRx: 100, TransferTx: 200},
		},
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.handshake_fresh", inspector.StatusWarn)
}
// TestCheckWireGuard_NeverHandshaked: a peer with a zero handshake
// timestamp has never connected, which fails wg.handshake_all.
func TestCheckWireGuard_NeverHandshaked(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: 0},
		},
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.handshake_all", inspector.StatusFail)
}
// TestCheckWireGuard_NoTraffic: a peer with zero bytes in both
// directions triggers a wg.peer_traffic warning.
func TestCheckWireGuard_NoTraffic(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix(), TransferRx: 0, TransferTx: 0},
		},
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.peer_traffic", inspector.StatusWarn)
}
// TestCheckWireGuard_CatchAllRoute: a peer advertising 0.0.0.0/0 would
// hijack all traffic and must fail wg.peer_catch_all.
func TestCheckWireGuard_CatchAllRoute(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "0.0.0.0/0", LatestHandshake: time.Now().Unix(), TransferRx: 100, TransferTx: 200},
		},
	}
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	expectStatus(t, CheckWireGuard(cluster), "wg.peer_catch_all", inspector.StatusFail)
}
// TestCheckWireGuard_CrossNode_PeerCountConsistent: identical peer
// counts and MTUs across nodes pass both cross-node checks.
func TestCheckWireGuard_CrossNode_PeerCountConsistent(t *testing.T) {
	all := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		node := makeNodeData(host, "node")
		node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2, MTU: 1420}
		all[host] = node
	}
	got := CheckWireGuard(makeCluster(all))
	expectStatus(t, got, "wg.peer_count_consistent", inspector.StatusPass)
	expectStatus(t, got, "wg.mtu_consistent", inspector.StatusPass)
}
// TestCheckWireGuard_CrossNode_PeerCountInconsistent: differing peer
// counts across nodes trigger a wg.peer_count_consistent warning.
func TestCheckWireGuard_CrossNode_PeerCountInconsistent(t *testing.T) {
	peerCounts := map[string]int{"1.1.1.1": 2, "2.2.2.2": 2, "3.3.3.3": 1}
	all := map[string]*inspector.NodeData{}
	for host, count := range peerCounts {
		node := makeNodeData(host, "node")
		node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: count, MTU: 1420}
		all[host] = node
	}
	got := CheckWireGuard(makeCluster(all))
	expectStatus(t, got, "wg.peer_count_consistent", inspector.StatusWarn)
}
// TestCheckWireGuard_NilData: nodes that returned no WireGuard data at
// all yield no results.
func TestCheckWireGuard_NilData(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	cluster := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node})
	if got := CheckWireGuard(cluster); len(got) != 0 {
		t.Errorf("expected 0 results for nil WireGuard data, got %d", len(got))
	}
}

1268
pkg/inspector/collector.go Normal file

File diff suppressed because it is too large Load Diff

118
pkg/inspector/config.go Normal file
View File

@ -0,0 +1,118 @@
package inspector
import (
"bufio"
"fmt"
"os"
"strings"
)
// Node represents a remote node parsed from remote-nodes.conf.
// One Node corresponds to one non-comment line of the config file
// (format: environment|user@host|password|role|ssh_key).
type Node struct {
	Environment string // devnet, testnet
	User        string // SSH user
	Host        string // IP or hostname
	Password    string // SSH password (used when SSHKey is empty)
	Role        string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3
	SSHKey      string // optional path to SSH key; empty means password auth
}
// Name returns a short display name for the node (user@host).
func (n Node) Name() string {
	return n.User + "@" + n.Host
}
// IsNameserver reports whether the node has any nameserver role
// (role strings of the form "nameserver-ns<N>").
func (n Node) IsNameserver() bool {
	const prefix = "nameserver"
	return strings.HasPrefix(n.Role, prefix)
}
// LoadNodes parses a remote-nodes.conf file into a slice of Nodes.
//
// Each non-blank, non-comment line has the pipe-delimited form:
//
//	environment|user@host|password|role|ssh_key
//
// where ssh_key is optional. Blank lines and lines starting with '#'
// are skipped. A malformed line yields an error naming its line number.
func LoadNodes(path string) ([]Node, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open config: %w", err)
	}
	defer f.Close()

	var nodes []Node
	scanner := bufio.NewScanner(f)
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		parts := strings.SplitN(line, "|", 5)
		if len(parts) < 4 {
			return nil, fmt.Errorf("line %d: expected at least 4 pipe-delimited fields, got %d", lineNum, len(parts))
		}
		env := parts[0]
		userHost := parts[1]
		password := parts[2]
		role := parts[3]
		var sshKey string
		if len(parts) == 5 {
			sshKey = parts[4]
		}
		// Parse user@host. LastIndex is used so an '@' inside the user
		// part cannot split the host.
		at := strings.LastIndex(userHost, "@")
		if at < 0 {
			return nil, fmt.Errorf("line %d: expected user@host format, got %q", lineNum, userHost)
		}
		user := userHost[:at]
		host := userHost[at+1:]
		// Fix: reject degenerate "@host" / "user@" entries here instead of
		// producing a Node that can never be dialed.
		if user == "" || host == "" {
			return nil, fmt.Errorf("line %d: empty user or host in %q", lineNum, userHost)
		}
		nodes = append(nodes, Node{
			Environment: env,
			User:        user,
			Host:        host,
			Password:    password,
			Role:        role,
			SSHKey:      sshKey,
		})
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading config: %w", err)
	}
	return nodes, nil
}
// FilterByEnv returns only nodes matching the given environment.
func FilterByEnv(nodes []Node, env string) []Node {
	var out []Node
	for _, node := range nodes {
		if node.Environment != env {
			continue
		}
		out = append(out, node)
	}
	return out
}
// FilterByRole returns only nodes whose role starts with rolePrefix
// (e.g. "nameserver" matches nameserver-ns1, nameserver-ns2, ...).
func FilterByRole(nodes []Node, rolePrefix string) []Node {
	var out []Node
	for _, node := range nodes {
		if !strings.HasPrefix(node.Role, rolePrefix) {
			continue
		}
		out = append(out, node)
	}
	return out
}
// RegularNodes returns the nodes whose role is exactly "node",
// excluding all nameserver roles.
func RegularNodes(nodes []Node) []Node {
	var out []Node
	for _, node := range nodes {
		if node.Role != "node" {
			continue
		}
		out = append(out, node)
	}
	return out
}

View File

@ -0,0 +1,179 @@
package inspector
import (
"os"
"path/filepath"
"testing"
)
// TestLoadNodes exercises the happy path: comments are skipped, fields
// land in the right struct members, and the optional ssh_key column is
// honored.
func TestLoadNodes(t *testing.T) {
	content := `# Comment line
devnet|ubuntu@1.2.3.4|pass123|node
devnet|ubuntu@1.2.3.5|pass456|node
devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key
`
	path := writeTempFile(t, content)
	got, err := LoadNodes(path)
	if err != nil {
		t.Fatalf("LoadNodes: %v", err)
	}
	if len(got) != 3 {
		t.Fatalf("want 3 nodes, got %d", len(got))
	}
	// First node: every field populated, no SSH key.
	first := got[0]
	if first.Environment != "devnet" {
		t.Errorf("node[0].Environment = %q, want devnet", first.Environment)
	}
	if first.User != "ubuntu" {
		t.Errorf("node[0].User = %q, want ubuntu", first.User)
	}
	if first.Host != "1.2.3.4" {
		t.Errorf("node[0].Host = %q, want 1.2.3.4", first.Host)
	}
	if first.Password != "pass123" {
		t.Errorf("node[0].Password = %q, want pass123", first.Password)
	}
	if first.Role != "node" {
		t.Errorf("node[0].Role = %q, want node", first.Role)
	}
	if first.SSHKey != "" {
		t.Errorf("node[0].SSHKey = %q, want empty", first.SSHKey)
	}
	// Third node: nameserver role with an explicit SSH key.
	third := got[2]
	if third.Role != "nameserver-ns1" {
		t.Errorf("node[2].Role = %q, want nameserver-ns1", third.Role)
	}
	if third.SSHKey != "/path/to/key" {
		t.Errorf("node[2].SSHKey = %q, want /path/to/key", third.SSHKey)
	}
}
func TestLoadNodes_EmptyLines(t *testing.T) {
content := `
# Full line comment
devnet|ubuntu@1.2.3.4|pass|node
# Another comment
devnet|ubuntu@1.2.3.5|pass|node
`
path := writeTempFile(t, content)
nodes, err := LoadNodes(path)
if err != nil {
t.Fatalf("LoadNodes: %v", err)
}
if len(nodes) != 2 {
t.Fatalf("want 2 nodes (blank/comment lines skipped), got %d", len(nodes))
}
}
func TestLoadNodes_InvalidFormat(t *testing.T) {
tests := []struct {
name string
content string
}{
{"too few fields", "devnet|ubuntu@1.2.3.4|pass\n"},
{"no @ in userhost", "devnet|localhost|pass|node\n"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
path := writeTempFile(t, tt.content)
_, err := LoadNodes(path)
if err == nil {
t.Error("expected error for invalid format")
}
})
}
}
// TestLoadNodes_FileNotFound: a missing config file surfaces as an error.
func TestLoadNodes_FileNotFound(t *testing.T) {
	if _, err := LoadNodes("/nonexistent/path/file.conf"); err == nil {
		t.Error("expected error for nonexistent file")
	}
}
// TestFilterByEnv: only nodes of the requested environment survive.
func TestFilterByEnv(t *testing.T) {
	input := []Node{
		{Environment: "devnet", Host: "1.1.1.1"},
		{Environment: "testnet", Host: "2.2.2.2"},
		{Environment: "devnet", Host: "3.3.3.3"},
	}
	got := FilterByEnv(input, "devnet")
	if len(got) != 2 {
		t.Fatalf("want 2 devnet nodes, got %d", len(got))
	}
	for _, node := range got {
		if node.Environment != "devnet" {
			t.Errorf("got env=%s, want devnet", node.Environment)
		}
	}
}
// TestFilterByRole: the prefix match keeps all nameserver-* roles.
func TestFilterByRole(t *testing.T) {
	input := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "nameserver-ns2", Host: "3.3.3.3"},
		{Role: "node", Host: "4.4.4.4"},
	}
	got := FilterByRole(input, "nameserver")
	if len(got) != 2 {
		t.Fatalf("want 2 nameserver nodes, got %d", len(got))
	}
}
// TestRegularNodes: nameserver roles are excluded, plain nodes kept.
func TestRegularNodes(t *testing.T) {
	input := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "node", Host: "3.3.3.3"},
	}
	got := RegularNodes(input)
	if len(got) != 2 {
		t.Fatalf("want 2 regular nodes, got %d", len(got))
	}
}
// TestNode_Name: display name is the user@host pair.
func TestNode_Name(t *testing.T) {
	node := Node{User: "ubuntu", Host: "1.2.3.4"}
	if name := node.Name(); name != "ubuntu@1.2.3.4" {
		t.Errorf("Name() = %q, want ubuntu@1.2.3.4", name)
	}
}
// TestNode_IsNameserver: any role with the "nameserver" prefix counts;
// everything else, including the empty role, does not.
func TestNode_IsNameserver(t *testing.T) {
	cases := []struct {
		role string
		want bool
	}{
		{"nameserver-ns1", true},
		{"nameserver-ns2", true},
		{"node", false},
		{"", false},
	}
	for _, tc := range cases {
		t.Run(tc.role, func(t *testing.T) {
			got := Node{Role: tc.role}.IsNameserver()
			if got != tc.want {
				t.Errorf("IsNameserver(%q) = %v, want %v", tc.role, got, tc.want)
			}
		})
	}
}
func writeTempFile(t *testing.T, content string) string {
t.Helper()
dir := t.TempDir()
path := filepath.Join(dir, "test-nodes.conf")
if err := os.WriteFile(path, []byte(content), 0644); err != nil {
t.Fatalf("write temp file: %v", err)
}
return path
}

136
pkg/inspector/report.go Normal file
View File

@ -0,0 +1,136 @@
package inspector
import (
"encoding/json"
"fmt"
"io"
"sort"
"strings"
)
// PrintTable writes a human-readable table of check results.
//
// Checks are ordered worst-first (fail, warn, pass, skip), then by
// descending severity, then by ID, and grouped under one heading per
// subsystem in order of first appearance. A summary line closes the
// report.
func PrintTable(results *Results, w io.Writer) {
	if len(results.Checks) == 0 {
		fmt.Fprintf(w, "No checks executed.\n")
		return
	}
	// Sort a copy so the caller's slice order is left untouched.
	ordered := make([]CheckResult, len(results.Checks))
	copy(ordered, results.Checks)
	sort.Slice(ordered, func(i, j int) bool {
		a, b := ordered[i], ordered[j]
		if oa, ob := statusOrder(a.Status), statusOrder(b.Status); oa != ob {
			return oa < ob
		}
		// Higher severity first within the same status.
		if a.Severity != b.Severity {
			return a.Severity > b.Severity
		}
		return a.ID < b.ID
	})
	// Bucket by subsystem, remembering first-seen order.
	bySub := map[string][]CheckResult{}
	var subOrder []string
	for _, chk := range ordered {
		if _, seen := bySub[chk.Subsystem]; !seen {
			subOrder = append(subOrder, chk.Subsystem)
		}
		bySub[chk.Subsystem] = append(bySub[chk.Subsystem], chk)
	}
	for _, sub := range subOrder {
		fmt.Fprintf(w, "\n%s %s\n", severityIcon(Critical), strings.ToUpper(sub))
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))
		for _, chk := range bySub[sub] {
			nodePart := ""
			if chk.Node != "" {
				nodePart = fmt.Sprintf(" (%s)", chk.Node)
			}
			sev := fmt.Sprintf("[%s]", chk.Severity)
			fmt.Fprintf(w, " %s %-8s %s%s\n", statusIcon(chk.Status), sev, chk.Name, nodePart)
			if chk.Message != "" {
				fmt.Fprintf(w, " %s\n", chk.Message)
			}
		}
	}
	passed, failed, warned, skipped := results.Summary()
	fmt.Fprintf(w, "\n%s\n", strings.Repeat("=", 70))
	fmt.Fprintf(w, "Summary: %d passed, %d failed, %d warnings, %d skipped (%.1fs)\n",
		passed, failed, warned, skipped, results.Duration.Seconds())
}
// PrintJSON writes check results as indented JSON: a "summary" object
// with aggregate counts followed by the raw "checks" array.
func PrintJSON(results *Results, w io.Writer) {
	type summaryJSON struct {
		Passed  int     `json:"passed"`
		Failed  int     `json:"failed"`
		Warned  int     `json:"warned"`
		Skipped int     `json:"skipped"`
		Total   int     `json:"total"`
		Seconds float64 `json:"duration_seconds"`
	}
	type payload struct {
		Summary summaryJSON   `json:"summary"`
		Checks  []CheckResult `json:"checks"`
	}
	passed, failed, warned, skipped := results.Summary()
	out := payload{
		Summary: summaryJSON{
			Passed:  passed,
			Failed:  failed,
			Warned:  warned,
			Skipped: skipped,
			Total:   len(results.Checks),
			Seconds: results.Duration.Seconds(),
		},
		Checks: results.Checks,
	}
	enc := json.NewEncoder(w)
	enc.SetIndent("", " ")
	// Encode error deliberately discarded: w is the caller's sink and
	// this printer's signature has no error path.
	_ = enc.Encode(out)
}
// SummaryLine returns a one-line summary string of the aggregate counts.
func SummaryLine(results *Results) string {
	p, f, wn, s := results.Summary()
	return fmt.Sprintf("%d passed, %d failed, %d warnings, %d skipped", p, f, wn, s)
}
// statusOrder maps a check status to its sort rank: failures first,
// then warnings, passes, and skips; unknown statuses sort last.
func statusOrder(st Status) int {
	switch st {
	case StatusSkip:
		return 3
	case StatusPass:
		return 2
	case StatusWarn:
		return 1
	case StatusFail:
		return 0
	}
	return 4
}
// statusIcon renders a short fixed label for a check status; unknown
// statuses render as "??".
func statusIcon(st Status) string {
	switch st {
	case StatusSkip:
		return "SKIP"
	case StatusWarn:
		return "WARN"
	case StatusFail:
		return "FAIL"
	case StatusPass:
		return "OK"
	}
	return "??"
}
// severityIcon returns the marker printed before each subsystem heading.
// The severity is currently ignored — every heading uses the same "##"
// glyph — but the parameter keeps call sites ready for severity-specific
// icons later.
func severityIcon(_ Severity) string {
	return "##"
}

View File

@ -0,0 +1,135 @@
package inspector
import (
"bytes"
"encoding/json"
"strings"
"testing"
"time"
)
func TestPrintTable_EmptyResults(t *testing.T) {
r := &Results{}
var buf bytes.Buffer
PrintTable(r, &buf)
if !strings.Contains(buf.String(), "No checks executed") {
t.Errorf("expected 'No checks executed', got %q", buf.String())
}
}
// TestPrintTable_SortsFailuresFirst: table rows are ordered FAIL, then
// WARN, then OK.
func TestPrintTable_SortsFailuresFirst(t *testing.T) {
	results := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "Pass check", Subsystem: "test", Status: StatusPass, Severity: Low},
			{ID: "b", Name: "Fail check", Subsystem: "test", Status: StatusFail, Severity: Critical},
			{ID: "c", Name: "Warn check", Subsystem: "test", Status: StatusWarn, Severity: High},
		},
	}
	var out bytes.Buffer
	PrintTable(results, &out)
	text := out.String()
	posFail := strings.Index(text, "FAIL")
	posWarn := strings.Index(text, "WARN")
	posOK := strings.Index(text, "OK")
	if posFail < 0 || posWarn < 0 || posOK < 0 {
		t.Fatalf("expected FAIL, WARN, and OK in output:\n%s", text)
	}
	if posFail > posWarn {
		t.Errorf("FAIL (pos %d) should appear before WARN (pos %d)", posFail, posWarn)
	}
	if posWarn > posOK {
		t.Errorf("WARN (pos %d) should appear before OK (pos %d)", posWarn, posOK)
	}
}
// TestPrintTable_IncludesNode: a check's node name is shown in its row.
func TestPrintTable_IncludesNode(t *testing.T) {
	results := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "Check A", Subsystem: "test", Status: StatusPass, Node: "ubuntu@1.2.3.4"},
		},
	}
	var out bytes.Buffer
	PrintTable(results, &out)
	if !strings.Contains(out.String(), "ubuntu@1.2.3.4") {
		t.Error("expected node name in table output")
	}
}
// TestPrintTable_IncludesSummary: the trailing summary line reports the
// pass/fail counts.
func TestPrintTable_IncludesSummary(t *testing.T) {
	results := &Results{
		Duration: 2 * time.Second,
		Checks: []CheckResult{
			{ID: "a", Subsystem: "test", Status: StatusPass},
			{ID: "b", Subsystem: "test", Status: StatusFail},
		},
	}
	var out bytes.Buffer
	PrintTable(results, &out)
	text := out.String()
	if !strings.Contains(text, "1 passed") {
		t.Error("summary should mention passed count")
	}
	if !strings.Contains(text, "1 failed") {
		t.Error("summary should mention failed count")
	}
}
// TestPrintJSON_ValidJSON: the JSON output parses and carries correct
// summary counts plus the full checks array.
func TestPrintJSON_ValidJSON(t *testing.T) {
	results := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "A", Subsystem: "test", Status: StatusPass, Severity: Low, Message: "ok"},
			{ID: "b", Name: "B", Subsystem: "test", Status: StatusFail, Severity: High, Message: "bad"},
		},
	}
	var out bytes.Buffer
	PrintJSON(results, &out)
	var doc map[string]interface{}
	if err := json.Unmarshal(out.Bytes(), &doc); err != nil {
		t.Fatalf("output is not valid JSON: %v\nraw: %s", err, out.String())
	}
	summary, ok := doc["summary"].(map[string]interface{})
	if !ok {
		t.Fatal("missing 'summary' object in JSON")
	}
	if v := summary["passed"]; v != float64(1) {
		t.Errorf("summary.passed = %v, want 1", v)
	}
	if v := summary["failed"]; v != float64(1) {
		t.Errorf("summary.failed = %v, want 1", v)
	}
	if v := summary["total"]; v != float64(2) {
		t.Errorf("summary.total = %v, want 2", v)
	}
	checks, ok := doc["checks"].([]interface{})
	if !ok {
		t.Fatal("missing 'checks' array in JSON")
	}
	if len(checks) != 2 {
		t.Errorf("want 2 checks, got %d", len(checks))
	}
}
// TestSummaryLine: counts of each status are folded into one line.
func TestSummaryLine(t *testing.T) {
	results := &Results{
		Checks: []CheckResult{
			{Status: StatusPass},
			{Status: StatusPass},
			{Status: StatusFail},
			{Status: StatusWarn},
		},
	}
	want := "2 passed, 1 failed, 1 warnings, 0 skipped"
	if got := SummaryLine(results); got != want {
		t.Errorf("SummaryLine = %q, want %q", got, want)
	}
}

165
pkg/inspector/ssh.go Normal file
View File

@ -0,0 +1,165 @@
package inspector
import (
"bytes"
"context"
"fmt"
"os/exec"
"strings"
"syscall"
"time"
)
// SSH retry tuning: a connection-level failure is retried up to
// sshMaxRetries additional times, waiting sshRetryDelay between attempts
// (remote command failures are never retried — see RunSSH).
const (
	sshMaxRetries = 3
	sshRetryDelay = 2 * time.Second
)
// SSHResult holds the output of an SSH command execution.
type SSHResult struct {
	Stdout   string        // trimmed standard output of the remote command
	Stderr   string        // trimmed standard error (also carries ssh/sshpass diagnostics)
	ExitCode int           // exit status of the ssh process; stays 0 when no exit status was reported
	Duration time.Duration // wall-clock time of the (last) attempt
	Err      error         // error from running the local ssh process, if any
	Retries  int // how many retries were needed
}
// OK reports whether the command succeeded: no process error and a
// zero exit code.
func (r SSHResult) OK() bool {
	if r.Err != nil {
		return false
	}
	return r.ExitCode == 0
}
// RunSSH executes a command on a remote node via SSH, retrying only
// connection-level failures (up to sshMaxRetries extra attempts with
// sshRetryDelay between them). A remote command that runs and exits
// non-zero is returned immediately — retrying would not help. Uses
// sshpass for password auth, or -i for key-based auth; -n keeps SSH
// from reading stdin.
func RunSSH(ctx context.Context, node Node, command string) SSHResult {
	var last SSHResult
	for attempt := 0; attempt <= sshMaxRetries; attempt++ {
		last = runSSHOnce(ctx, node, command)
		last.Retries = attempt
		// Stop when: the ssh process ran to completion (success, or the
		// remote command itself failed — Err == nil either way), the
		// failure is not a connection-level error, or the context is done.
		if last.Err == nil || !isSSHConnectionError(last) || ctx.Err() != nil {
			return last
		}
		// Back off before the next attempt (skipped after the final one).
		if attempt < sshMaxRetries {
			select {
			case <-time.After(sshRetryDelay):
			case <-ctx.Done():
				return last
			}
		}
	}
	return last
}
// runSSHOnce executes a single SSH attempt and captures its streams,
// exit code, and duration. No retries happen at this level.
func runSSHOnce(ctx context.Context, node Node, command string) SSHResult {
	start := time.Now()
	target := fmt.Sprintf("%s@%s", node.User, node.Host)
	var argv []string
	if node.SSHKey != "" {
		// Key-based auth; BatchMode guarantees no interactive prompt.
		argv = []string{
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			"-o", "BatchMode=yes",
			"-i", node.SSHKey,
			target,
			command,
		}
	} else {
		// Password auth via sshpass.
		// NOTE(review): -p places the password in the local process list
		// while the command runs — acceptable on a trusted admin host,
		// but worth confirming; sshpass -f/-e would avoid it.
		argv = []string{
			"sshpass", "-p", node.Password,
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			target,
			command,
		}
	}
	cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
	var outBuf, errBuf bytes.Buffer
	cmd.Stdout = &outBuf
	cmd.Stderr = &errBuf
	runErr := cmd.Run()
	elapsed := time.Since(start)

	// Recover the exit status when the process ran but exited non-zero;
	// other failures (binary missing, context cancelled) leave it at 0
	// with Err set.
	code := 0
	if runErr != nil {
		if exitErr, ok := runErr.(*exec.ExitError); ok {
			if ws, ok := exitErr.Sys().(syscall.WaitStatus); ok {
				code = ws.ExitStatus()
			}
		}
	}
	return SSHResult{
		Stdout:   strings.TrimSpace(outBuf.String()),
		Stderr:   strings.TrimSpace(errBuf.String()),
		ExitCode: code,
		Duration: elapsed,
		Err:      runErr,
	}
}
// isSSHConnectionError returns true if the failure looks like an SSH
// connection problem (timeout, refused, network unreachable) rather
// than a remote command error, and is therefore worth retrying.
func isSSHConnectionError(r SSHResult) bool {
	// ssh exits 255 for any connection-level error. sshpass's own codes
	// (5 = bad password, 6 = host key verification failed) are not
	// retriable and fall through to the stderr scan below.
	if r.ExitCode == 255 {
		return true
	}
	needles := []string{
		"connection refused",
		"connection timed out",
		"connection reset",
		"no route to host",
		"network is unreachable",
		"could not resolve hostname",
		"ssh_exchange_identification",
		"broken pipe",
		"connection closed by remote host",
	}
	lowered := strings.ToLower(r.Stderr)
	for _, needle := range needles {
		if strings.Contains(lowered, needle) {
			return true
		}
	}
	return false
}
// RunSSHMulti executes a multi-command string on a remote node.
// Commands are joined with " && " so failure stops execution.
func RunSSHMulti(ctx context.Context, node Node, commands []string) SSHResult {
	return RunSSH(ctx, node, strings.Join(commands, " && "))
}

View File

@ -61,7 +61,9 @@ func (cm *ClusterConfigManager) UpdateAllClusterPeers() error {
func (cm *ClusterConfigManager) RepairPeerConfiguration() error {
cm.logger.Info("Attempting to repair IPFS Cluster peer configuration")
_ = cm.FixIPFSConfigAddresses()
if err := cm.FixIPFSConfigAddresses(); err != nil {
cm.logger.Warn("Failed to fix IPFS config addresses during repair", zap.Error(err))
}
peers, err := cm.DiscoverClusterPeersFromGateway()
if err != nil {
@ -72,7 +74,9 @@ func (cm *ClusterConfigManager) RepairPeerConfiguration() error {
peerAddrs = append(peerAddrs, p.Multiaddress)
}
if len(peerAddrs) > 0 {
_ = cm.UpdatePeerAddresses(peerAddrs)
if err := cm.UpdatePeerAddresses(peerAddrs); err != nil {
cm.logger.Warn("Failed to update peer addresses during repair", zap.Error(err))
}
}
}

View File

@ -77,19 +77,6 @@ func parseIPFSPort(rawURL string) (int, error) {
return port, nil
}
func parsePeerHostAndPort(multiaddr string) (string, int) {
parts := strings.Split(multiaddr, "/")
var hostStr string
var port int
for i, part := range parts {
if part == "ip4" || part == "dns" || part == "dns4" {
hostStr = parts[i+1]
} else if part == "tcp" {
fmt.Sscanf(parts[i+1], "%d", &port)
}
}
return hostStr, port
}
func extractIPFromMultiaddrForCluster(maddr string) string {
parts := strings.Split(maddr, "/")

View File

@ -893,21 +893,35 @@ func (cm *ClusterManager) GetClusterStatus(ctx context.Context, clusterID string
ClusterID: cluster.ID,
}
// Check individual service status
// TODO: Actually check each service's health
if cluster.Status == ClusterStatusReady {
status.RQLiteReady = true
status.OlricReady = true
status.GatewayReady = true
status.DNSReady = true
}
// Get node list
// Check individual service status by inspecting cluster nodes
nodes, err := cm.getClusterNodes(ctx, clusterID)
if err == nil {
runningCount := 0
hasRQLite := false
hasOlric := false
hasGateway := false
for _, node := range nodes {
status.Nodes = append(status.Nodes, node.NodeID)
if node.Status == NodeStatusRunning {
runningCount++
}
if node.RQLiteHTTPPort > 0 {
hasRQLite = true
}
if node.OlricHTTPPort > 0 {
hasOlric = true
}
if node.GatewayHTTPPort > 0 {
hasGateway = true
}
}
allRunning := len(nodes) > 0 && runningCount == len(nodes)
status.RQLiteReady = allRunning && hasRQLite
status.OlricReady = allRunning && hasOlric
status.GatewayReady = allRunning && hasGateway
status.DNSReady = allRunning
}
if cluster.ErrorMessage != "" {

View File

@ -6,6 +6,7 @@ import (
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/constants"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
@ -176,12 +177,10 @@ func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipA
}
// Calculate available capacity
const (
maxDeployments = 100
maxPorts = 9900 // User deployment port range
maxMemoryMB = 8192 // 8GB
maxCPUPercent = 400 // 4 cores
)
maxDeployments := constants.MaxDeploymentsPerNode
maxPorts := constants.MaxPortsPerNode
maxMemoryMB := constants.MaxMemoryMB
maxCPUPercent := constants.MaxCPUPercent
availablePorts := maxPorts - allocatedPorts
if availablePorts < 0 {
@ -363,23 +362,3 @@ func (cns *ClusterNodeSelector) calculateCapacityScore(
return totalScore
}
// GetNodeByID retrieves a node's information by ID
func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) {
internalCtx := client.WithInternalAuth(ctx)
var results []nodeInfo
query := `SELECT id, ip_address, COALESCE(internal_ip, ip_address) as internal_ip FROM dns_nodes WHERE id = ? LIMIT 1`
err := cns.db.Query(internalCtx, &results, query, nodeID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query node",
Cause: err,
}
}
if len(results) == 0 {
return nil, nil
}
return &results[0], nil
}

View File

@ -3,6 +3,7 @@ package namespace
import (
"context"
"fmt"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
@ -369,19 +370,5 @@ func isConflictError(err error) bool {
return false
}
errStr := err.Error()
return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
}
// contains checks if a string contains a substring (case-insensitive)
func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
}
func findSubstring(s, substr string) bool {
for i := 0; i <= len(s)-len(substr); i++ {
if s[i:i+len(substr)] == substr {
return true
}
}
return false
return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict")
}

View File

@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"errors"
"strings"
"testing"
"time"
@ -269,7 +270,7 @@ func TestContains(t *testing.T) {
for _, tt := range tests {
t.Run(tt.s+"_"+tt.substr, func(t *testing.T) {
result := contains(tt.s, tt.substr)
result := strings.Contains(tt.s, tt.substr)
if result != tt.expected {
t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, result, tt.expected)
}

View File

@ -1,25 +1,9 @@
package namespace
import (
"fmt"
"net"
)
import "github.com/DeBrosOfficial/network/pkg/wireguard"
// getWireGuardIP returns the IPv4 address of the wg0 interface.
// Used as a fallback when Olric BindAddr is empty or 0.0.0.0.
func getWireGuardIP() (string, error) {
iface, err := net.InterfaceByName("wg0")
if err != nil {
return "", fmt.Errorf("wg0 interface not found: %w", err)
}
addrs, err := iface.Addrs()
if err != nil {
return "", fmt.Errorf("failed to get wg0 addresses: %w", err)
}
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil {
return ipnet.IP.String(), nil
}
}
return "", fmt.Errorf("no IPv4 address on wg0")
return wireguard.GetIP()
}

View File

@ -11,6 +11,7 @@ import (
"time"
"github.com/DeBrosOfficial/network/pkg/logging"
"github.com/DeBrosOfficial/network/pkg/wireguard"
"go.uber.org/zap"
)
@ -414,20 +415,7 @@ func (n *Node) isNameserverNode(ctx context.Context) bool {
// getWireGuardIP returns the IPv4 address assigned to the wg0 interface, if any
func (n *Node) getWireGuardIP() (string, error) {
iface, err := net.InterfaceByName("wg0")
if err != nil {
return "", err
}
addrs, err := iface.Addrs()
if err != nil {
return "", err
}
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil {
return ipnet.IP.String(), nil
}
}
return "", fmt.Errorf("no IPv4 address on wg0")
return wireguard.GetIP()
}
// getNodeIPAddress attempts to determine the node's external IP address

View File

@ -47,7 +47,9 @@ func (r *RQLiteManager) waitForMinClusterSizeBeforeStart(ctx context.Context, rq
return nil
}
_ = r.discoveryService.TriggerPeerExchange(ctx)
if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil {
r.logger.Warn("Failed to trigger peer exchange before cluster wait", zap.Error(err))
}
checkInterval := 2 * time.Second
for {
@ -92,7 +94,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
return fmt.Errorf("discovery service not available")
}
_ = r.discoveryService.TriggerPeerExchange(ctx)
if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil {
r.logger.Warn("Failed to trigger peer exchange during pre-start discovery", zap.Error(err))
}
time.Sleep(1 * time.Second)
r.discoveryService.TriggerSync()
time.Sleep(2 * time.Second)
@ -123,7 +127,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
zap.Int("discovered_peers", discoveredPeers),
zap.Int("min_cluster_size", r.config.MinClusterSize))
// Still write peers.json with just ourselves - better than nothing
_ = r.discoveryService.ForceWritePeersJSON()
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
r.logger.Warn("Failed to write single-node peers.json fallback", zap.Error(err))
}
return nil
}
@ -137,8 +143,12 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
}
if ourLogIndex == 0 && maxPeerIndex > 0 {
_ = r.clearRaftState(rqliteDataDir)
_ = r.discoveryService.ForceWritePeersJSON()
if err := r.clearRaftState(rqliteDataDir); err != nil {
r.logger.Warn("Failed to clear raft state during pre-start discovery", zap.Error(err))
}
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
r.logger.Warn("Failed to write peers.json after clearing raft state", zap.Error(err))
}
}
}
@ -150,7 +160,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
// recoverCluster restarts RQLite using peers.json
func (r *RQLiteManager) recoverCluster(ctx context.Context, peersJSONPath string) error {
_ = r.Stop()
if err := r.Stop(); err != nil {
r.logger.Warn("Failed to stop RQLite during cluster recovery", zap.Error(err))
}
time.Sleep(2 * time.Second)
rqliteDataDir, err := r.rqliteDataDirPath()
@ -187,10 +199,14 @@ func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error {
}
if ourIndex == 0 && maxPeerIndex > 0 {
_ = r.clearRaftState(rqliteDataDir)
if err := r.clearRaftState(rqliteDataDir); err != nil {
r.logger.Warn("Failed to clear raft state during split-brain recovery", zap.Error(err))
}
r.discoveryService.TriggerPeerExchange(ctx)
time.Sleep(1 * time.Second)
_ = r.discoveryService.ForceWritePeersJSON()
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
r.logger.Warn("Failed to write peers.json during split-brain recovery", zap.Error(err))
}
return r.recoverCluster(ctx, filepath.Join(rqliteDataDir, "raft", "peers.json"))
}
@ -265,7 +281,9 @@ func (r *RQLiteManager) startHealthMonitoring(ctx context.Context) {
return
case <-ticker.C:
if r.isInSplitBrainState() {
_ = r.recoverFromSplitBrain(ctx)
if err := r.recoverFromSplitBrain(ctx); err != nil {
r.logger.Warn("Split-brain recovery attempt failed", zap.Error(err))
}
}
}
}

View File

@ -3,14 +3,21 @@ package cache
import (
"context"
"sync"
"time"
"github.com/tetratelabs/wazero"
"go.uber.org/zap"
)
// cacheEntry wraps a compiled module with access tracking for LRU eviction.
type cacheEntry struct {
module wazero.CompiledModule
lastAccessed time.Time
}
// ModuleCache manages compiled WASM module caching.
type ModuleCache struct {
modules map[string]wazero.CompiledModule
modules map[string]*cacheEntry
mu sync.RWMutex
capacity int
logger *zap.Logger
@ -19,7 +26,7 @@ type ModuleCache struct {
// NewModuleCache creates a new ModuleCache.
func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache {
return &ModuleCache{
modules: make(map[string]wazero.CompiledModule),
modules: make(map[string]*cacheEntry),
capacity: capacity,
logger: logger,
}
@ -27,15 +34,20 @@ func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache {
// Get retrieves a compiled module from the cache.
func (c *ModuleCache) Get(wasmCID string) (wazero.CompiledModule, bool) {
c.mu.RLock()
defer c.mu.RUnlock()
c.mu.Lock()
defer c.mu.Unlock()
module, exists := c.modules[wasmCID]
return module, exists
entry, exists := c.modules[wasmCID]
if !exists {
return nil, false
}
entry.lastAccessed = time.Now()
return entry.module, true
}
// Set stores a compiled module in the cache.
// If the cache is full, it evicts the oldest module.
// If the cache is full, it evicts the least recently used module.
func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) {
c.mu.Lock()
defer c.mu.Unlock()
@ -50,7 +62,10 @@ func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) {
c.evictOldest()
}
c.modules[wasmCID] = module
c.modules[wasmCID] = &cacheEntry{
module: module,
lastAccessed: time.Now(),
}
c.logger.Debug("Module cached",
zap.String("wasm_cid", wasmCID),
@ -63,8 +78,8 @@ func (c *ModuleCache) Delete(ctx context.Context, wasmCID string) {
c.mu.Lock()
defer c.mu.Unlock()
if module, exists := c.modules[wasmCID]; exists {
_ = module.Close(ctx)
if entry, exists := c.modules[wasmCID]; exists {
_ = entry.module.Close(ctx)
delete(c.modules, wasmCID)
c.logger.Debug("Module removed from cache", zap.String("wasm_cid", wasmCID))
}
@ -97,8 +112,8 @@ func (c *ModuleCache) Clear(ctx context.Context) {
c.mu.Lock()
defer c.mu.Unlock()
for cid, module := range c.modules {
if err := module.Close(ctx); err != nil {
for cid, entry := range c.modules {
if err := entry.module.Close(ctx); err != nil {
c.logger.Warn("Failed to close cached module during clear",
zap.String("cid", cid),
zap.Error(err),
@ -106,7 +121,7 @@ func (c *ModuleCache) Clear(ctx context.Context) {
}
}
c.modules = make(map[string]wazero.CompiledModule)
c.modules = make(map[string]*cacheEntry)
c.logger.Debug("Module cache cleared")
}
@ -118,16 +133,23 @@ func (c *ModuleCache) GetStats() (size int, capacity int) {
return len(c.modules), c.capacity
}
// evictOldest removes the oldest module from cache.
// evictOldest removes the least recently accessed module from cache.
// Must be called with mu held.
func (c *ModuleCache) evictOldest() {
// Simple LRU: just remove the first one we find
// In production, you'd want proper LRU tracking
for cid, module := range c.modules {
_ = module.Close(context.Background())
delete(c.modules, cid)
c.logger.Debug("Evicted module from cache", zap.String("wasm_cid", cid))
break
var oldestCID string
var oldestTime time.Time
for cid, entry := range c.modules {
if oldestCID == "" || entry.lastAccessed.Before(oldestTime) {
oldestCID = cid
oldestTime = entry.lastAccessed
}
}
if oldestCID != "" {
_ = c.modules[oldestCID].module.Close(context.Background())
delete(c.modules, oldestCID)
c.logger.Debug("Evicted LRU module from cache", zap.String("wasm_cid", oldestCID))
}
}
@ -135,12 +157,13 @@ func (c *ModuleCache) evictOldest() {
// The compute function is called with the lock released to avoid blocking.
func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.CompiledModule, error)) (wazero.CompiledModule, error) {
// Try to get from cache first
c.mu.RLock()
if module, exists := c.modules[wasmCID]; exists {
c.mu.RUnlock()
return module, nil
c.mu.Lock()
if entry, exists := c.modules[wasmCID]; exists {
entry.lastAccessed = time.Now()
c.mu.Unlock()
return entry.module, nil
}
c.mu.RUnlock()
c.mu.Unlock()
// Compute the module (without holding the lock)
module, err := compute()
@ -153,9 +176,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil
defer c.mu.Unlock()
// Double-check (another goroutine might have added it)
if existingModule, exists := c.modules[wasmCID]; exists {
if entry, exists := c.modules[wasmCID]; exists {
_ = module.Close(context.Background()) // Discard our compilation
return existingModule, nil
entry.lastAccessed = time.Now()
return entry.module, nil
}
// Evict if cache is full
@ -163,7 +187,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil
c.evictOldest()
}
c.modules[wasmCID] = module
c.modules[wasmCID] = &cacheEntry{
module: module,
lastAccessed: time.Now(),
}
c.logger.Debug("Module compiled and cached",
zap.String("wasm_cid", wasmCID),

View File

@ -81,36 +81,3 @@ func (m *ModuleLifecycle) ValidateModule(module wazero.CompiledModule) error {
return nil
}
// InstantiateModule creates a module instance for execution.
// Note: This method is currently unused but kept for potential future use.
func (m *ModuleLifecycle) InstantiateModule(ctx context.Context, compiled wazero.CompiledModule, config wazero.ModuleConfig) error {
if compiled == nil {
return fmt.Errorf("compiled module is nil")
}
instance, err := m.runtime.InstantiateModule(ctx, compiled, config)
if err != nil {
return fmt.Errorf("failed to instantiate module: %w", err)
}
// Close immediately - this is just for validation
_ = instance.Close(ctx)
return nil
}
// ModuleInfo provides information about a compiled module.
type ModuleInfo struct {
CID string
SizeBytes int
Compiled bool
}
// GetModuleInfo returns information about a module.
func (m *ModuleLifecycle) GetModuleInfo(wasmCID string, wasmBytes []byte, isCompiled bool) *ModuleInfo {
return &ModuleInfo{
CID: wasmCID,
SizeBytes: len(wasmBytes),
Compiled: isCompiled,
}
}

View File

@ -3,6 +3,7 @@ package serverless
import (
"context"
"encoding/json"
"errors"
"fmt"
"time"
@ -249,7 +250,7 @@ func (i *Invoker) isRetryable(err error) bool {
// Retry execution errors (could be transient)
var execErr *ExecutionError
if ok := errorAs(err, &execErr); ok {
if errors.As(err, &execErr) {
return true
}
@ -347,22 +348,6 @@ type DLQMessage struct {
CallerWallet string `json:"caller_wallet,omitempty"`
}
// errorAs is a helper to avoid import of errors package.
func errorAs(err error, target interface{}) bool {
if err == nil {
return false
}
// Simple type assertion for our custom error types
switch t := target.(type) {
case **ExecutionError:
if e, ok := err.(*ExecutionError); ok {
*t = e
return true
}
}
return false
}
// -----------------------------------------------------------------------------
// Batch Invocation (for future use)
// -----------------------------------------------------------------------------

View File

@ -438,27 +438,6 @@ func (r *Registry) uploadWASM(ctx context.Context, wasmBytes []byte, name string
return resp.Cid, nil
}
// getLatestVersion returns the latest version number for a function.
func (r *Registry) getLatestVersion(ctx context.Context, namespace, name string) (int, error) {
query := `SELECT MAX(version) FROM functions WHERE namespace = ? AND name = ?`
var maxVersion sql.NullInt64
var results []struct {
MaxVersion sql.NullInt64 `db:"max(version)"`
}
if err := r.db.Query(ctx, &results, query, namespace, name); err != nil {
return 0, err
}
if len(results) == 0 || !results[0].MaxVersion.Valid {
return 0, ErrFunctionNotFound
}
maxVersion = results[0].MaxVersion
return int(maxVersion.Int64), nil
}
// getByNameInternal retrieves a function by name regardless of status.
func (r *Registry) getByNameInternal(ctx context.Context, namespace, name string) (*Function, error) {
namespace = strings.TrimSpace(namespace)

View File

@ -82,12 +82,9 @@ func GetTLSConfig() *tls.Config {
MinVersion: tls.VersionTLS12,
}
// If we have a CA cert pool, use it
// If we have a CA cert pool, use it for verifying self-signed certs
if caCertPool != nil {
config.RootCAs = caCertPool
} else if len(trustedDomains) > 0 {
// Fallback: skip verification if trusted domains are configured but no CA pool
config.InsecureSkipVerify = true
}
return config
@ -103,11 +100,12 @@ func NewHTTPClient(timeout time.Duration) *http.Client {
}
}
// NewHTTPClientForDomain creates an HTTP client configured for a specific domain
// NewHTTPClientForDomain creates an HTTP client configured for a specific domain.
// Only skips TLS verification for explicitly trusted domains when no CA cert is available.
func NewHTTPClientForDomain(timeout time.Duration, hostname string) *http.Client {
tlsConfig := GetTLSConfig()
// If this domain is in trusted list and we don't have a CA pool, allow insecure
// Only skip TLS for explicitly trusted domains when no CA pool is configured
if caCertPool == nil && ShouldSkipTLSVerify(hostname) {
tlsConfig.InsecureSkipVerify = true
}

24
pkg/wireguard/ip.go Normal file
View File

@ -0,0 +1,24 @@
package wireguard
import (
"fmt"
"net"
)
// GetIP returns the IPv4 address of the wg0 interface.
// It fails when the interface does not exist, its addresses cannot be read,
// or no IPv4 address is assigned to it.
func GetIP() (string, error) {
	wg, err := net.InterfaceByName("wg0")
	if err != nil {
		return "", fmt.Errorf("wg0 interface not found: %w", err)
	}

	addrList, err := wg.Addrs()
	if err != nil {
		return "", fmt.Errorf("failed to get wg0 addresses: %w", err)
	}

	// Return the first IPv4 address; IPv6-only entries are skipped.
	for _, a := range addrList {
		ipnet, ok := a.(*net.IPNet)
		if ok && ipnet.IP.To4() != nil {
			return ipnet.IP.String(), nil
		}
	}

	return "", fmt.Errorf("no IPv4 address on wg0")
}

View File

@ -1,298 +0,0 @@
#!/usr/bin/env bash

# block-node.sh - Temporarily block network access to a gateway node (local or remote)
# Usage:
#   Local:  ./scripts/block-node.sh <node_number> <duration_seconds>
#   Remote: ./scripts/block-node.sh --remote <remote_node_number> <duration_seconds>
# Example:
#   ./scripts/block-node.sh 1 60            # Block local node-1 (port 6001) for 60 seconds
#   ./scripts/block-node.sh --remote 2 120  # Block remote node-2 for 120 seconds

# Fail fast: abort on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Remote node configurations - loaded from config file
# remote-nodes.conf lives next to this script; format: <num>|<user@host>|<password>
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG_FILE="$SCRIPT_DIR/remote-nodes.conf"
# Function to get remote node config
# Looks up one field for a numbered remote node in remote-nodes.conf.
# The file is pipe-separated: <num>|<user@host>|<password>
# Arguments:
#   $1 - node number to look up
#   $2 - field to return: "user_host" or "password"
# Prints the field on stdout; prints an empty string and returns 1 when the
# config file is missing or the node number is not present.
get_remote_node_config() {
    local node_num="$1"
    local field="$2" # "user_host" or "password"

    if [ ! -f "$CONFIG_FILE" ]; then
        echo ""
        return 1
    fi

    # '|| [ -n "$num" ]' keeps a final line without a trailing newline readable
    while IFS='|' read -r num user_host password || [ -n "$num" ]; do
        # Skip comments and empty lines
        [[ "$num" =~ ^#.*$ ]] || [[ -z "$num" ]] && continue

        # Trim whitespace (xargs strips leading/trailing blanks)
        num=$(echo "$num" | xargs)
        user_host=$(echo "$user_host" | xargs)
        password=$(echo "$password" | xargs)

        if [ "$num" = "$node_num" ]; then
            if [ "$field" = "user_host" ]; then
                echo "$user_host"
            elif [ "$field" = "password" ]; then
                echo "$password"
            fi
            return 0
        fi
    done < "$CONFIG_FILE"

    echo ""
    return 1
}
# Display usage
# Prints argument help plus the local/remote node mappings, then exits 1.
usage() {
    echo -e "${RED}Error:${NC} Invalid arguments"
    echo ""
    echo -e "${BLUE}Usage:${NC}"
    echo " $0 <node_number> <duration_seconds> # Local mode"
    echo " $0 --remote <remote_node_number> <duration_seconds> # Remote mode"
    echo ""
    echo -e "${GREEN}Local Mode Examples:${NC}"
    echo " $0 1 60 # Block local node-1 (port 6001) for 60 seconds"
    echo " $0 2 120 # Block local node-2 (port 6002) for 120 seconds"
    echo ""
    echo -e "${GREEN}Remote Mode Examples:${NC}"
    echo " $0 --remote 1 60 # Block remote node-1 (51.83.128.181) for 60 seconds"
    echo " $0 --remote 3 120 # Block remote node-3 (83.171.248.66) for 120 seconds"
    echo ""
    echo -e "${YELLOW}Local Node Mapping:${NC}"
    echo " Node 1 -> Port 6001"
    echo " Node 2 -> Port 6002"
    echo " Node 3 -> Port 6003"
    echo " Node 4 -> Port 6004"
    echo " Node 5 -> Port 6005"
    echo ""
    echo -e "${YELLOW}Remote Node Mapping:${NC}"
    echo " Remote 1 -> ubuntu@51.83.128.181"
    echo " Remote 2 -> root@194.61.28.7"
    echo " Remote 3 -> root@83.171.248.66"
    echo " Remote 4 -> root@62.72.44.87"
    exit 1
}
# Parse arguments
# Two accepted shapes: "--remote <num> <duration>" or "<num> <duration>".
REMOTE_MODE=false
if [ $# -eq 3 ] && [ "$1" == "--remote" ]; then
    REMOTE_MODE=true
    NODE_NUM="$2"
    DURATION="$3"
elif [ $# -eq 2 ]; then
    NODE_NUM="$1"
    DURATION="$2"
else
    usage
fi

# Validate duration (must be a positive integer number of seconds)
if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -le 0 ]; then
    echo -e "${RED}Error:${NC} Duration must be a positive integer"
    exit 1
fi

# Calculate port (local nodes use 6001-6005, remote nodes use 80 and 443)
if [ "$REMOTE_MODE" = true ]; then
    # Remote nodes: block standard HTTP/HTTPS ports
    PORTS="80 443"
else
    # Local nodes: block the specific gateway port (node N listens on 6000+N)
    PORT=$((6000 + NODE_NUM))
fi
# Function to block ports on remote server
# SSHes into the configured remote node (credentials from remote-nodes.conf),
# installs iptables DROP rules for the given ports, waits the requested
# duration with a countdown, then removes the rules again.
# Arguments:
#   $1 - remote node number (1-4)
#   $2 - duration in seconds
#   $3 - space-separated port list, e.g. "80 443"
# Exits 1 on validation, connection, or rule-install failure.
block_remote_node() {
    local node_num="$1"
    local duration="$2"
    local ports="$3" # Can be space-separated list like "80 443"

    # Validate remote node number
    if ! [[ "$node_num" =~ ^[1-4]$ ]]; then
        echo -e "${RED}Error:${NC} Remote node number must be between 1 and 4"
        exit 1
    fi

    # Get credentials from config file
    local user_host=$(get_remote_node_config "$node_num" "user_host")
    local password=$(get_remote_node_config "$node_num" "password")

    if [ -z "$user_host" ] || [ -z "$password" ]; then
        echo -e "${RED}Error:${NC} Configuration for remote node $node_num not found in $CONFIG_FILE"
        exit 1
    fi

    # Strip the "user@" prefix to get the bare hostname/IP
    local host="${user_host##*@}"

    echo -e "${BLUE}=== Remote Network Blocking Tool ===${NC}"
    echo -e "Remote Node: ${GREEN}$node_num${NC} ($user_host)"
    echo -e "Ports: ${GREEN}$ports${NC}"
    echo -e "Duration: ${GREEN}$duration seconds${NC}"
    echo ""

    # Check if sshpass is installed
    if ! command -v sshpass &> /dev/null; then
        echo -e "${RED}Error:${NC} sshpass is not installed. Install it first:"
        echo -e " ${YELLOW}macOS:${NC} brew install hudochenkov/sshpass/sshpass"
        echo -e " ${YELLOW}Ubuntu/Debian:${NC} sudo apt-get install sshpass"
        exit 1
    fi

    # SSH options - force password authentication only to avoid "too many auth failures"
    SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -o PreferredAuthentications=password -o PubkeyAuthentication=no -o NumberOfPasswordPrompts=1"

    echo -e "${YELLOW}Connecting to remote server...${NC}"

    # Test connection
    if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "echo 'Connected successfully' > /dev/null"; then
        echo -e "${RED}Error:${NC} Failed to connect to $user_host"
        exit 1
    fi

    # NOTE(review): a status glyph between ${GREEN} and ${NC} appears lost in extraction — confirm against the original file
    echo -e "${GREEN}${NC} Connected to $host"

    # Install iptables rules on remote server
    echo -e "${YELLOW}Installing iptables rules on remote server...${NC}"

    # Build iptables commands for all ports
    # Rules DROP both inbound traffic to the port and outbound traffic from it
    BLOCK_CMDS=""
    for port in $ports; do
        BLOCK_CMDS="${BLOCK_CMDS}iptables -I INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; "
        BLOCK_CMDS="${BLOCK_CMDS}iptables -I OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; "
    done
    BLOCK_CMDS="${BLOCK_CMDS}echo 'Rules installed'"

    if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$BLOCK_CMDS"; then
        echo -e "${RED}Error:${NC} Failed to install iptables rules"
        exit 1
    fi

    echo -e "${GREEN}${NC} Ports $ports are now blocked on $host"
    echo -e "${YELLOW}Waiting $duration seconds...${NC}"
    echo ""

    # Show countdown
    for ((i=duration; i>0; i--)); do
        printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i"
        sleep 1
    done
    echo ""
    echo ""

    echo -e "${YELLOW}Removing iptables rules from remote server...${NC}"

    # Build iptables removal commands for all ports (mirror of the -I rules above)
    UNBLOCK_CMDS=""
    for port in $ports; do
        UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; "
        UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; "
    done
    UNBLOCK_CMDS="${UNBLOCK_CMDS}echo 'Rules removed'"

    # Cleanup failure is a warning, not fatal: the operator may need to remove rules manually
    if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$UNBLOCK_CMDS"; then
        echo -e "${YELLOW}Warning:${NC} Failed to remove some iptables rules. You may need to clean up manually."
    else
        echo -e "${GREEN}${NC} Ports $ports are now accessible again on $host"
    fi

    echo ""
    echo -e "${GREEN}=== Done! ===${NC}"
    echo -e "Remote node ${GREEN}$node_num${NC} ($host) was unreachable for $duration seconds and is now accessible again."
}
# Function to block port locally using process pause (SIGSTOP)
# Finds the process listening on the node's gateway port, pauses it with
# SIGSTOP for the requested duration, then resumes it with SIGCONT.
# Arguments:
#   $1 - local node number (1-5)
#   $2 - duration in seconds
#   $3 - gateway port to inspect
# Exits 1 when validation fails, no listener is found, or SIGSTOP fails.
block_local_node() {
    local node_num="$1"
    local duration="$2"
    local port="$3"

    # Validate node number
    if ! [[ "$node_num" =~ ^[1-5]$ ]]; then
        echo -e "${RED}Error:${NC} Local node number must be between 1 and 5"
        exit 1
    fi

    echo -e "${BLUE}=== Local Network Blocking Tool ===${NC}"
    echo -e "Node: ${GREEN}node-$node_num${NC}"
    echo -e "Port: ${GREEN}$port${NC}"
    echo -e "Duration: ${GREEN}$duration seconds${NC}"
    echo -e "Method: ${GREEN}Process Pause (SIGSTOP/SIGCONT)${NC}"
    echo ""

    # Find the process listening on the port
    echo -e "${YELLOW}Finding process listening on port $port...${NC}"

    # macOS uses different tools than Linux
    if [[ "$(uname -s)" == "Darwin" ]]; then
        # macOS: use lsof
        PID=$(lsof -ti :$port 2>/dev/null | head -1 || echo "")
    else
        # Linux: use ss or netstat (ss preferred when available)
        if command -v ss &> /dev/null; then
            PID=$(ss -tlnp | grep ":$port " | grep -oP 'pid=\K[0-9]+' | head -1 || echo "")
        else
            PID=$(netstat -tlnp 2>/dev/null | grep ":$port " | awk '{print $7}' | cut -d'/' -f1 | head -1 || echo "")
        fi
    fi

    if [ -z "$PID" ]; then
        echo -e "${RED}Error:${NC} No process found listening on port $port"
        echo -e "Make sure node-$node_num is running first."
        exit 1
    fi

    # Get process name (best-effort; "unknown" when ps cannot resolve the PID)
    PROCESS_NAME=$(ps -p $PID -o comm= 2>/dev/null || echo "unknown")
    echo -e "${GREEN}${NC} Found process: ${BLUE}$PROCESS_NAME${NC} (PID: ${BLUE}$PID${NC})"
    echo ""

    # Pause the process
    echo -e "${YELLOW}Pausing process (SIGSTOP)...${NC}"
    if ! kill -STOP $PID 2>/dev/null; then
        echo -e "${RED}Error:${NC} Failed to pause process. You may need sudo privileges."
        exit 1
    fi

    echo -e "${GREEN}${NC} Process paused - node-$node_num is now unreachable"
    echo -e "${YELLOW}Waiting $duration seconds...${NC}"
    echo ""

    # Show countdown
    for ((i=duration; i>0; i--)); do
        printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i"
        sleep 1
    done
    echo ""
    echo ""

    # Resume the process; failure here is a warning (process may have died)
    echo -e "${YELLOW}Resuming process (SIGCONT)...${NC}"
    if ! kill -CONT $PID 2>/dev/null; then
        echo -e "${YELLOW}Warning:${NC} Failed to resume process. It may have been terminated."
    else
        echo -e "${GREEN}${NC} Process resumed - node-$node_num is now accessible again"
    fi

    echo ""
    echo -e "${GREEN}=== Done! ===${NC}"
    echo -e "Local node ${GREEN}node-$node_num${NC} was unreachable for $duration seconds and is now accessible again."
}
# Main execution
# Dispatch to remote (iptables over SSH) or local (SIGSTOP) implementation.
if [ "$REMOTE_MODE" = true ]; then
    block_remote_node "$NODE_NUM" "$DURATION" "$PORTS"
else
    block_local_node "$NODE_NUM" "$DURATION" "$PORT"
fi

View File

@ -1,112 +0,0 @@
#!/bin/bash
set -e

# Build custom CoreDNS binary with RQLite plugin
# This script compiles CoreDNS with the custom RQLite plugin
# Steps: clone CoreDNS at a pinned version into /tmp, register the rqlite
# plugin in plugin.cfg, copy the plugin source in, resolve deps, build, and
# install the binary into the project's bin/ directory.

COREDNS_VERSION="1.11.1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
COREDNS_DIR="/tmp/coredns-build"

echo "Building CoreDNS v${COREDNS_VERSION} with RQLite plugin..."

# Clean previous build
rm -rf "$COREDNS_DIR"
mkdir -p "$COREDNS_DIR"

# Clone CoreDNS
echo "Cloning CoreDNS..."
cd "$COREDNS_DIR"
git clone --depth 1 --branch v${COREDNS_VERSION} https://github.com/coredns/coredns.git
cd coredns

# Create plugin.cfg with RQLite plugin
# Plugin order in this file determines CoreDNS plugin execution order.
echo "Configuring plugins..."
cat > plugin.cfg <<EOF
# Standard CoreDNS plugins
metadata:metadata
cancel:cancel
tls:tls
reload:reload
nsid:nsid
bufsize:bufsize
root:root
bind:bind
debug:debug
trace:trace
ready:ready
health:health
pprof:pprof
prometheus:metrics
errors:errors
log:log
dnstap:dnstap
local:local
dns64:dns64
acl:acl
any:any
chaos:chaos
loadbalance:loadbalance
cache:cache
rewrite:rewrite
header:header
dnssec:dnssec
autopath:autopath
minimal:minimal
template:template
transfer:transfer
hosts:hosts
route53:route53
azure:azure
clouddns:clouddns
k8s_external:k8s_external
kubernetes:kubernetes
file:file
auto:auto
secondary:secondary
loop:loop
forward:forward
grpc:grpc
erratic:erratic
whoami:whoami
on:github.com/coredns/caddy/onevent
sign:sign
view:view

# Response Rate Limiting (DNS amplification protection)
rrl:rrl

# Custom RQLite plugin
rqlite:github.com/DeBrosOfficial/network/pkg/coredns/rqlite
EOF

# Copy RQLite plugin to CoreDNS
echo "Copying RQLite plugin..."
mkdir -p plugin/rqlite
cp -r "$PROJECT_ROOT/pkg/coredns/rqlite/"* plugin/rqlite/

# Update go.mod to include our dependencies
# NOTE(review): plugin.cfg references the plugin by its network module path,
# while the source is also copied into plugin/rqlite/ — verify both are needed
echo "Updating dependencies..."
go get github.com/rqlite/rqlite-go@latest
go get github.com/coredns/coredns@v${COREDNS_VERSION}
go mod tidy

# Build CoreDNS
echo "Building CoreDNS binary..."
make

# Copy binary to project
echo "Copying binary to project..."
cp coredns "$PROJECT_ROOT/bin/coredns-custom"
chmod +x "$PROJECT_ROOT/bin/coredns-custom"

echo ""
echo "✅ CoreDNS built successfully!"
echo "Binary location: $PROJECT_ROOT/bin/coredns-custom"
echo ""
echo "To deploy:"
echo " 1. Copy binary to /usr/local/bin/coredns on each nameserver node"
echo " 2. Copy configs/coredns/Corefile to /etc/coredns/Corefile"
echo " 3. Start CoreDNS: sudo systemctl start coredns"
echo ""
View File

@ -1,379 +0,0 @@
#!/bin/bash

# Production Cluster Health Check Script
# Tests RQLite, IPFS, and IPFS Cluster connectivity and replication

# Note: We don't use 'set -e' here because we want to continue testing even if individual checks fail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Node IPs - Update these if needed (overridable via environment variables)
BOOTSTRAP="${BOOTSTRAP:-51.83.128.181}"
NODE1="${NODE1:-57.128.223.92}"
NODE2="${NODE2:-185.185.83.89}"

ALL_NODES=($BOOTSTRAP $NODE1 $NODE2)

# Counters (incremented by the print_* helpers, reported at the end)
PASSED=0
FAILED=0
WARNINGS=0

# Helper functions
# print_header: section banner for each test group
print_header() {
    echo ""
    echo -e "${BLUE}========================================${NC}"
    echo -e "${BLUE}$1${NC}"
    echo -e "${BLUE}========================================${NC}"
}

# print_test: announce the check about to run
print_test() {
    echo -e "${YELLOW}$1${NC}"
}

# print_pass: report success and bump the PASSED counter
print_pass() {
    echo -e "${GREEN}$1${NC}"
    PASSED=$((PASSED + 1))
}

# print_fail: report failure and bump the FAILED counter
print_fail() {
    echo -e "${RED}$1${NC}"
    FAILED=$((FAILED + 1))
}

# print_warn: report a non-fatal anomaly and bump the WARNINGS counter
print_warn() {
    echo -e "${YELLOW}$1${NC}"
    WARNINGS=$((WARNINGS + 1))
}

# print_info: indented informational detail (no counter change)
print_info() {
    echo -e " $1"
}
# Test functions
# test_rqlite_status: queries /status on every node's RQLite (port 5001) and
# verifies raft state (exactly 1 Leader + 2 Followers), peer counts, cluster
# membership, and that commit indices are in sync across nodes.
test_rqlite_status() {
    print_header "1. RQLITE CLUSTER STATUS"

    local leader_found=false
    local follower_count=0
    local commit_indices=()

    for i in "${!ALL_NODES[@]}"; do
        local node="${ALL_NODES[$i]}"
        print_test "Testing RQLite on $node"

        if ! response=$(curl -s --max-time 5 http://$node:5001/status 2>/dev/null); then
            print_fail "Cannot connect to RQLite on $node:5001"
            continue
        fi

        # Pull raft fields out of the status JSON; '//' supplies jq defaults
        local state=$(echo "$response" | jq -r '.store.raft.state // "unknown"')
        local num_peers=$(echo "$response" | jq -r '.store.raft.num_peers // 0')
        local commit_index=$(echo "$response" | jq -r '.store.raft.commit_index // 0')
        local last_contact=$(echo "$response" | jq -r '.store.raft.last_contact // "N/A"')
        local config=$(echo "$response" | jq -r '.store.raft.latest_configuration // "[]"')
        # Count cluster members by counting "Address" occurrences in the config
        local node_count=$(echo "$config" | grep -o "Address" | wc -l | tr -d ' ')

        commit_indices+=($commit_index)

        print_info "State: $state | Peers: $num_peers | Commit Index: $commit_index | Cluster Nodes: $node_count"

        # Check state
        if [ "$state" = "Leader" ]; then
            leader_found=true
            print_pass "Node $node is the Leader"
        elif [ "$state" = "Follower" ]; then
            follower_count=$((follower_count + 1))
            # Check last contact (a follower should have recent leader contact)
            if [ "$last_contact" != "N/A" ] && [ "$last_contact" != "0" ]; then
                print_pass "Node $node is a Follower (last contact: $last_contact)"
            else
                print_warn "Node $node is Follower but last_contact is $last_contact"
            fi
        else
            print_fail "Node $node has unexpected state: $state"
        fi

        # Check peer count (3-node cluster: each node should see 2 peers)
        if [ "$num_peers" = "2" ]; then
            print_pass "Node $node has correct peer count: 2"
        else
            print_fail "Node $node has incorrect peer count: $num_peers (expected 2)"
        fi

        # Check cluster configuration
        if [ "$node_count" = "3" ]; then
            print_pass "Node $node sees all 3 cluster members"
        else
            print_fail "Node $node only sees $node_count cluster members (expected 3)"
        fi

        echo ""
    done

    # Check for exactly 1 leader
    if [ "$leader_found" = true ] && [ "$follower_count" = "2" ]; then
        print_pass "Cluster has 1 Leader and 2 Followers ✓"
    else
        print_fail "Invalid cluster state (Leader found: $leader_found, Followers: $follower_count)"
    fi

    # Check commit index sync (only when all 3 nodes responded)
    if [ ${#commit_indices[@]} -eq 3 ]; then
        local first="${commit_indices[0]}"
        local all_same=true
        for idx in "${commit_indices[@]}"; do
            if [ "$idx" != "$first" ]; then
                all_same=false
                break
            fi
        done
        if [ "$all_same" = true ]; then
            print_pass "All nodes have synced commit index: $first"
        else
            print_warn "Commit indices differ: ${commit_indices[*]} (might be normal if writes are happening)"
        fi
    fi
}
# test_rqlite_replication: writes a timestamped row through the bootstrap
# node, waits briefly, then reads it back from every node to prove raft
# replication is delivering writes cluster-wide.
test_rqlite_replication() {
    print_header "2. RQLITE REPLICATION TEST"

    print_test "Creating test table and inserting data on leader ($BOOTSTRAP)"

    # Create table (idempotent via IF NOT EXISTS)
    if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
        -H "Content-Type: application/json" \
        -d '[["CREATE TABLE IF NOT EXISTS test_cluster_health (id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, node TEXT, value TEXT)"]]' 2>/dev/null); then
        print_fail "Failed to create table"
        return
    fi

    # An "already exists" error is fine; anything else aborts the test
    if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
        local error=$(echo "$response" | jq -r '.results[0].error')
        if [[ "$error" != "table test_cluster_health already exists" ]]; then
            print_fail "Table creation error: $error"
            return
        fi
    fi
    print_pass "Table exists"

    # Insert test data (unique per run via epoch-seconds suffix)
    local test_value="test_$(date +%s)"
    if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
        -H "Content-Type: application/json" \
        -d "[
            [\"INSERT INTO test_cluster_health (timestamp, node, value) VALUES (datetime('now'), 'bootstrap', '$test_value')\"]
        ]" 2>/dev/null); then
        print_fail "Failed to insert data"
        return
    fi

    if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
        local error=$(echo "$response" | jq -r '.results[0].error')
        print_fail "Insert error: $error"
        return
    fi
    print_pass "Data inserted: $test_value"

    # Wait for replication
    print_info "Waiting 2 seconds for replication..."
    sleep 2

    # Query from all nodes (level=weak allows reads from followers)
    for node in "${ALL_NODES[@]}"; do
        print_test "Reading from $node"

        if ! response=$(curl -s --max-time 5 -XPOST "http://$node:5001/db/query?level=weak" \
            -H "Content-Type: application/json" \
            -d "[\"SELECT * FROM test_cluster_health WHERE value = '$test_value' LIMIT 1\"]" 2>/dev/null); then
            print_fail "Failed to query from $node"
            continue
        fi

        if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
            local error=$(echo "$response" | jq -r '.results[0].error')
            print_fail "Query error on $node: $error"
            continue
        fi

        # Column index 3 is the "value" column of test_cluster_health
        local row_count=$(echo "$response" | jq -r '.results[0].values | length // 0')
        if [ "$row_count" = "1" ]; then
            local retrieved_value=$(echo "$response" | jq -r '.results[0].values[0][3] // ""')
            if [ "$retrieved_value" = "$test_value" ]; then
                print_pass "Data replicated correctly to $node"
            else
                print_fail "Data mismatch on $node (got: $retrieved_value, expected: $test_value)"
            fi
        else
            print_fail "Expected 1 row from $node, got $row_count"
        fi
    done
}
# Section 3: verify the IPFS daemon on every node answers its identity
# endpoint (POST /api/v0/id on the API port 4501) and report its agent
# version and advertised address count.
test_ipfs_status() {
    print_header "3. IPFS DAEMON STATUS"
    local node peer_id addr_count agent
    for node in "${ALL_NODES[@]}"; do
        print_test "Testing IPFS on $node"
        # Kubo requires POST for API calls; a transport failure means the
        # daemon is down or the port is unreachable.
        if ! response=$(curl -s --max-time 5 -X POST "http://$node:4501/api/v0/id" 2>/dev/null); then
            print_fail "Cannot connect to IPFS on $node:4501"
            continue
        fi
        peer_id=$(echo "$response" | jq -r '.ID // "unknown"')
        addr_count=$(echo "$response" | jq -r '.Addresses | length // 0')
        agent=$(echo "$response" | jq -r '.AgentVersion // "unknown"')
        if [ "$peer_id" = "unknown" ]; then
            # Connected, but the payload was not a valid identity response.
            print_fail "IPFS not responding correctly on $node"
            continue
        fi
        print_pass "IPFS running on $node (ID: ${peer_id:0:12}...)"
        print_info "Agent: $agent | Addresses: $addr_count"
    done
}
# Section 4: check libp2p connectivity between the nodes. In a healthy
# 3-node cluster each daemon should hold swarm connections to the other 2.
test_ipfs_swarm() {
    print_header "4. IPFS SWARM CONNECTIVITY"
    local node peers
    for node in "${ALL_NODES[@]}"; do
        print_test "Checking IPFS swarm peers on $node"
        if ! response=$(curl -s --max-time 5 -X POST "http://$node:4501/api/v0/swarm/peers" 2>/dev/null); then
            print_fail "Failed to get swarm peers from $node"
            continue
        fi
        peers=$(echo "$response" | jq -r '.Peers | length // 0')
        case "$peers" in
            2)
                # Fully meshed with both siblings.
                print_pass "Node $node connected to 2 IPFS peers"
                ;;
            '' | 0 | *[!0-9]*)
                # Zero peers, or jq produced nothing/garbage: treat as failure.
                print_fail "Node $node has no IPFS swarm peers"
                ;;
            *)
                # Some connectivity, but not the full mesh yet.
                print_warn "Node $node connected to $peers IPFS peers (expected 2)"
                ;;
        esac
    done
}
# Section 5: query the IPFS Cluster REST API (/id on port 9094) on every
# node. Confirms the cluster peer is alive and that its membership view
# includes all 3 peers.
test_ipfs_cluster_status() {
    print_header "5. IPFS CLUSTER STATUS"
    local node cid npeers ver
    for node in "${ALL_NODES[@]}"; do
        print_test "Testing IPFS Cluster on $node"
        if ! response=$(curl -s --max-time 5 "http://$node:9094/id" 2>/dev/null); then
            print_fail "Cannot connect to IPFS Cluster on $node:9094"
            continue
        fi
        cid=$(echo "$response" | jq -r '.id // "unknown"')
        npeers=$(echo "$response" | jq -r '.cluster_peers | length // 0')
        ver=$(echo "$response" | jq -r '.version // "unknown"')
        if [ "$cid" = "unknown" ]; then
            # Port answered but the body was not a cluster /id document.
            print_fail "IPFS Cluster not responding correctly on $node"
            continue
        fi
        print_pass "IPFS Cluster running on $node (ID: ${cid:0:12}...)"
        print_info "Version: $ver | Cluster Peers: $npeers"
        # Membership check: each peer's cluster_peers list includes itself.
        if [ "$npeers" = "3" ]; then
            print_pass "Node $node sees all 3 cluster peers"
        else
            print_warn "Node $node sees $npeers cluster peers (expected 3)"
        fi
    done
}
# Section 6: compare the pin count reported by each node's cluster API.
# All nodes should converge on the same pin set.
#
# NOTE(review): the cluster REST API streams GET /pins as newline-delimited
# JSON (one object per pin) rather than a single array, so a plain
# `jq 'length'` would emit one number per object (counting that object's
# keys). We slurp with `jq -s` and count the slurped items, unwrapping the
# single-array case in case the API returns a plain JSON array instead —
# confirm against the deployed ipfs-cluster version.
test_ipfs_cluster_pins() {
    print_header "6. IPFS CLUSTER PIN CONSISTENCY"
    local pin_counts=()
    local node pin_count
    for node in "${ALL_NODES[@]}"; do
        print_test "Checking pins on $node"
        if ! response=$(curl -s --max-time 5 "http://$node:9094/pins" 2>/dev/null); then
            print_fail "Failed to get pins from $node"
            pin_counts+=(0)
            continue
        fi
        # Slurp the stream; if the payload was a single JSON array, count its
        # elements, otherwise count the newline-delimited objects themselves.
        pin_count=$(echo "$response" | jq -s \
            'if length == 1 and (.[0] | type == "array") then (.[0] | length) else length end' \
            2>/dev/null)
        # Normalize jq failures / non-numeric output to 0 so the array always
        # gains exactly one well-formed entry per node (the unquoted append
        # used previously could word-split multi-line jq output).
        case "$pin_count" in
            '' | *[!0-9]*) pin_count=0 ;;
        esac
        pin_counts+=("$pin_count")
        print_pass "Node $node has $pin_count pins"
    done
    # Consistency check only makes sense once every node contributed a count.
    if [ ${#pin_counts[@]} -eq 3 ]; then
        local first="${pin_counts[0]}"
        local all_same=true
        local count
        for count in "${pin_counts[@]}"; do
            if [ "$count" != "$first" ]; then
                all_same=false
                break
            fi
        done
        if [ "$all_same" = true ]; then
            print_pass "All nodes have consistent pin count: $first"
        else
            # Divergence is a warning, not a failure: the cluster may still
            # be replicating recent pins.
            print_warn "Pin counts differ: ${pin_counts[*]} (might be syncing)"
        fi
    fi
}
# Final report: print the aggregate PASSED/WARNINGS/FAILED counters and
# terminate the script with a severity-based exit code:
#   0 = all critical tests passed, 1 = a few failures, 2 = widespread failures.
print_summary() {
    print_header "TEST SUMMARY"
    echo ""
    echo -e "${GREEN}Passed: $PASSED${NC}"
    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
    echo -e "${RED}Failed: $FAILED${NC}"
    echo ""
    # Guard-clause ladder: first matching severity wins and exits.
    if [ "$FAILED" -eq 0 ]; then
        echo -e "${GREEN}🎉 All critical tests passed! Cluster is healthy.${NC}"
        exit 0
    fi
    if [ "$FAILED" -le 2 ]; then
        echo -e "${YELLOW}⚠️ Some tests failed. Review the output above.${NC}"
        exit 1
    fi
    echo -e "${RED}❌ Multiple failures detected. Cluster needs attention.${NC}"
    exit 2
}
# Main execution
# Orchestrator: print the banner and cluster roster, run every test section
# in order, then emit the summary (print_summary exits with the final code).
main() {
    echo ""
    echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}"
    echo -e "${BLUE}║ DEBROS Production Cluster Health Check ║${NC}"
    echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}"
    echo ""
    echo "Testing cluster:"
    echo " Bootstrap: $BOOTSTRAP"
    echo " Node 1: $NODE1"
    echo " Node 2: $NODE2"
    # Run each suite by name; order matters (rqlite before IPFS/cluster).
    local suite
    for suite in \
        test_rqlite_status \
        test_rqlite_replication \
        test_ipfs_status \
        test_ipfs_swarm \
        test_ipfs_cluster_status \
        test_ipfs_cluster_pins; do
        "$suite"
    done
    print_summary
}
# Entry point — forward any CLI arguments (main currently ignores them).
main "$@"

View File

@ -1 +0,0 @@
agreed