mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 04:33:00 +00:00
Merge pull request #81 from DeBrosOfficial/cleanup/dead-code
Cleanup/dead code
This commit is contained in:
commit
051c002ec8
@ -1,6 +0,0 @@
|
||||
# THIS IS AUTOGENERATED. DO NOT EDIT MANUALLY
|
||||
version = 1
|
||||
name = "network"
|
||||
|
||||
[setup]
|
||||
script = "export MCP_BEARER_TOKEN=\"ra_9941ab97eb51668394a68963a2ab6fead0ca942afe437a6e2f4a520efcb24036\""
|
||||
7
.gitignore
vendored
7
.gitignore
vendored
@ -100,4 +100,9 @@ vps.txt
|
||||
|
||||
bin-linux/
|
||||
|
||||
website/
|
||||
website/
|
||||
|
||||
terms-agreement
|
||||
|
||||
cli
|
||||
./inspector
|
||||
48
Makefile
48
Makefile
@ -84,9 +84,9 @@ test-e2e-quick:
|
||||
# Network - Distributed P2P Database System
|
||||
# Makefile for development and build tasks
|
||||
|
||||
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
|
||||
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill redeploy-devnet redeploy-testnet release health
|
||||
|
||||
VERSION := 0.101.6
|
||||
VERSION := 0.102.0
|
||||
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
||||
@ -196,6 +196,42 @@ stop:
|
||||
kill:
|
||||
@bash scripts/dev-kill-all.sh
|
||||
|
||||
# Deploy to devnet (build + rolling upgrade all nodes)
|
||||
redeploy-devnet:
|
||||
@bash scripts/redeploy.sh --devnet
|
||||
|
||||
# Deploy to devnet without rebuilding
|
||||
redeploy-devnet-quick:
|
||||
@bash scripts/redeploy.sh --devnet --no-build
|
||||
|
||||
# Deploy to testnet (build + rolling upgrade all nodes)
|
||||
redeploy-testnet:
|
||||
@bash scripts/redeploy.sh --testnet
|
||||
|
||||
# Deploy to testnet without rebuilding
|
||||
redeploy-testnet-quick:
|
||||
@bash scripts/redeploy.sh --testnet --no-build
|
||||
|
||||
# Interactive release workflow (tag + push)
|
||||
release:
|
||||
@bash scripts/release.sh
|
||||
|
||||
# Check health of all nodes in an environment
|
||||
# Usage: make health ENV=devnet
|
||||
health:
|
||||
@if [ -z "$(ENV)" ]; then \
|
||||
echo "Usage: make health ENV=devnet|testnet"; \
|
||||
exit 1; \
|
||||
fi
|
||||
@while IFS='|' read -r env host pass role key; do \
|
||||
[ -z "$$env" ] && continue; \
|
||||
case "$$env" in \#*) continue;; esac; \
|
||||
env="$$(echo "$$env" | xargs)"; \
|
||||
[ "$$env" != "$(ENV)" ] && continue; \
|
||||
role="$$(echo "$$role" | xargs)"; \
|
||||
bash scripts/check-node-health.sh "$$host" "$$pass" "$$host ($$role)"; \
|
||||
done < scripts/remote-nodes.conf
|
||||
|
||||
# Help
|
||||
help:
|
||||
@echo "Available targets:"
|
||||
@ -225,6 +261,14 @@ help:
|
||||
@echo " Example production test:"
|
||||
@echo " ORAMA_GATEWAY_URL=https://dbrs.space make test-e2e-prod"
|
||||
@echo ""
|
||||
@echo "Deployment:"
|
||||
@echo " make redeploy-devnet - Build + rolling deploy to all devnet nodes"
|
||||
@echo " make redeploy-devnet-quick - Deploy to devnet without rebuilding"
|
||||
@echo " make redeploy-testnet - Build + rolling deploy to all testnet nodes"
|
||||
@echo " make redeploy-testnet-quick- Deploy to testnet without rebuilding"
|
||||
@echo " make health ENV=devnet - Check health of all nodes in an environment"
|
||||
@echo " make release - Interactive release workflow (tag + push)"
|
||||
@echo ""
|
||||
@echo "Development Management (via orama):"
|
||||
@echo " ./bin/orama dev status - Show status of all dev services"
|
||||
@echo " ./bin/orama dev logs <component> [--follow]"
|
||||
|
||||
@ -88,6 +88,10 @@ func main() {
|
||||
case "db":
|
||||
cli.HandleDBCommand(args)
|
||||
|
||||
// Cluster inspection
|
||||
case "inspect":
|
||||
cli.HandleInspectCommand(args)
|
||||
|
||||
// Namespace management
|
||||
case "namespace":
|
||||
cli.HandleNamespaceCommand(args)
|
||||
@ -173,6 +177,12 @@ func showHelp() {
|
||||
fmt.Printf("🏢 Namespaces:\n")
|
||||
fmt.Printf(" namespace delete - Delete current namespace and all resources\n\n")
|
||||
|
||||
fmt.Printf("🔍 Cluster Inspection:\n")
|
||||
fmt.Printf(" inspect - Inspect cluster health via SSH\n")
|
||||
fmt.Printf(" inspect --env devnet - Inspect devnet nodes\n")
|
||||
fmt.Printf(" inspect --subsystem rqlite - Inspect only RQLite subsystem\n")
|
||||
fmt.Printf(" inspect --format json - Output as JSON\n\n")
|
||||
|
||||
fmt.Printf("🌍 Environments:\n")
|
||||
fmt.Printf(" env list - List all environments\n")
|
||||
fmt.Printf(" env current - Show current environment\n")
|
||||
|
||||
@ -14,10 +14,6 @@ import (
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// For transition, alias main.GatewayConfig to pkg/gateway.Config
|
||||
// server.go will be removed; this keeps compatibility until then.
|
||||
type GatewayConfig = gateway.Config
|
||||
|
||||
func getEnvDefault(key, def string) string {
|
||||
if v := os.Getenv(key); strings.TrimSpace(v) != "" {
|
||||
return v
|
||||
|
||||
11
cmd/inspector/main.go
Normal file
11
cmd/inspector/main.go
Normal file
@ -0,0 +1,11 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/cli"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cli.HandleInspectCommand(os.Args[1:])
|
||||
}
|
||||
160
docs/COMMON_PROBLEMS.md
Normal file
160
docs/COMMON_PROBLEMS.md
Normal file
@ -0,0 +1,160 @@
|
||||
# Common Problems & Solutions
|
||||
|
||||
Troubleshooting guide for known issues in the Orama Network.
|
||||
|
||||
---
|
||||
|
||||
## 1. Namespace Gateway: "Olric unavailable"
|
||||
|
||||
**Symptom:** `ns-<name>.orama-devnet.network/v1/health` returns `"olric": {"status": "unavailable"}`.
|
||||
|
||||
**Cause:** The Olric memberlist gossip between namespace nodes is broken. Olric uses UDP pings for health checks — if those fail, the cluster can't bootstrap and the gateway reports Olric as unavailable.
|
||||
|
||||
### Check 1: WireGuard packet loss between nodes
|
||||
|
||||
SSH into each node and ping the other namespace nodes over WireGuard:
|
||||
|
||||
```bash
|
||||
ping -c 10 -W 2 10.0.0.X # replace with the WG IP of each peer
|
||||
```
|
||||
|
||||
If you see packet loss over WireGuard but **not** over the public IP (`ping <public-ip>`), the WireGuard peer session is corrupted.
|
||||
|
||||
**Fix — Reset the WireGuard peer on both sides:**
|
||||
|
||||
```bash
|
||||
# On Node A — replace <pubkey> and <endpoint> with Node B's values
|
||||
wg set wg0 peer <NodeB-pubkey> remove
|
||||
wg set wg0 peer <NodeB-pubkey> endpoint <NodeB-public-ip>:51820 allowed-ips <NodeB-wg-ip>/32 persistent-keepalive 25
|
||||
|
||||
# On Node B — same but with Node A's values
|
||||
wg set wg0 peer <NodeA-pubkey> remove
|
||||
wg set wg0 peer <NodeA-pubkey> endpoint <NodeA-public-ip>:51820 allowed-ips <NodeA-wg-ip>/32 persistent-keepalive 25
|
||||
```
|
||||
|
||||
Then restart services: `sudo orama prod restart`
|
||||
|
||||
You can find peer public keys with `wg show wg0`.
|
||||
|
||||
### Check 2: Olric bound to 0.0.0.0 instead of WireGuard IP
|
||||
|
||||
Check the Olric config on each node:
|
||||
|
||||
```bash
|
||||
cat /home/debros/.orama/data/namespaces/<name>/configs/olric-*.yaml
|
||||
```
|
||||
|
||||
If `bindAddr` is `0.0.0.0`, the node will try to bind to IPv6 on dual-stack hosts, breaking memberlist gossip.
|
||||
|
||||
**Fix:** Edit the YAML to use the node's WireGuard IP (run `ip addr show wg0` to find it), then restart: `sudo orama prod restart`
|
||||
|
||||
This was fixed in code (BindAddr validation in `SpawnOlric`), so new namespaces won't have this issue.
|
||||
|
||||
### Check 3: Olric logs show "Failed UDP ping" constantly
|
||||
|
||||
```bash
|
||||
journalctl -u debros-namespace-olric@<name>.service --no-pager -n 30
|
||||
```
|
||||
|
||||
If every UDP ping fails but TCP stream connections succeed, it's the WireGuard packet loss issue (see Check 1).
|
||||
|
||||
---
|
||||
|
||||
## 2. Namespace Gateway: Missing config fields
|
||||
|
||||
**Symptom:** Gateway config YAML is missing `global_rqlite_dsn`, has `olric_timeout: 0s`, or `olric_servers` only lists `localhost`.
|
||||
|
||||
**Cause:** Before the spawn handler fix, `spawnGatewayRemote()` didn't send `global_rqlite_dsn` or `olric_timeout` to remote nodes.
|
||||
|
||||
**Fix:** Edit the gateway config manually:
|
||||
|
||||
```bash
|
||||
vim /home/debros/.orama/data/namespaces/<name>/configs/gateway-*.yaml
|
||||
```
|
||||
|
||||
Add/fix:
|
||||
```yaml
|
||||
global_rqlite_dsn: "http://10.0.0.X:10001"
|
||||
olric_timeout: 30s
|
||||
olric_servers:
|
||||
- "10.0.0.X:10002"
|
||||
- "10.0.0.Y:10002"
|
||||
- "10.0.0.Z:10002"
|
||||
```
|
||||
|
||||
Then: `sudo orama prod restart`
|
||||
|
||||
This was fixed in code, so new namespaces get the correct config.
|
||||
|
||||
---
|
||||
|
||||
## 3. Namespace not restoring after restart (missing cluster-state.json)
|
||||
|
||||
**Symptom:** After `orama prod restart`, the namespace services don't come back because `RestoreLocalClustersFromDisk` has no state file.
|
||||
|
||||
**Check:**
|
||||
|
||||
```bash
|
||||
ls /home/debros/.orama/data/namespaces/<name>/cluster-state.json
|
||||
```
|
||||
|
||||
If the file doesn't exist, the node can't restore the namespace.
|
||||
|
||||
**Fix:** Create the file manually from another node that has it, or reconstruct it. The format is:
|
||||
|
||||
```json
|
||||
{
|
||||
"namespace": "<name>",
|
||||
"rqlite": { "http_port": 10001, "raft_port": 10000, ... },
|
||||
"olric": { "http_port": 10002, "memberlist_port": 10003, ... },
|
||||
"gateway": { "http_port": 10004, ... }
|
||||
}
|
||||
```
|
||||
|
||||
This was fixed in code — `ProvisionCluster` now saves state to all nodes (including remote ones via the `save-cluster-state` spawn action).
|
||||
|
||||
---
|
||||
|
||||
## 4. Namespace gateway processes not restarting after upgrade
|
||||
|
||||
**Symptom:** After `orama upgrade --restart` or `orama prod restart`, namespace gateway/olric/rqlite services don't start.
|
||||
|
||||
**Cause:** `orama prod stop` disables systemd template services (`debros-namespace-gateway@<name>.service`). They have `PartOf=debros-node.service`, but that only propagates restart to **enabled** services.
|
||||
|
||||
**Fix:** Re-enable the services before restarting:
|
||||
|
||||
```bash
|
||||
systemctl enable debros-namespace-rqlite@<name>.service
|
||||
systemctl enable debros-namespace-olric@<name>.service
|
||||
systemctl enable debros-namespace-gateway@<name>.service
|
||||
sudo orama prod restart
|
||||
```
|
||||
|
||||
This was fixed in code — the upgrade orchestrator now re-enables `@` services before restarting.
|
||||
|
||||
---
|
||||
|
||||
## 5. SSH commands eating stdin inside heredocs
|
||||
|
||||
**Symptom:** When running a script that SSHes into multiple nodes inside a heredoc (`<<'EOS'`), only the first SSH command runs — the rest are silently skipped.
|
||||
|
||||
**Cause:** `ssh` reads from stdin, consuming the rest of the heredoc.
|
||||
|
||||
**Fix:** Add `-n` flag to all `ssh` calls inside heredocs:
|
||||
|
||||
```bash
|
||||
ssh -n user@host 'command'
|
||||
```
|
||||
|
||||
`scp` is not affected (doesn't read stdin).
|
||||
|
||||
---
|
||||
|
||||
## General Debugging Tips
|
||||
|
||||
- **Always use `sudo orama prod restart`** instead of raw `systemctl` commands
|
||||
- **Namespace data lives at:** `/home/debros/.orama/data/namespaces/<name>/`
|
||||
- **Check service logs:** `journalctl -u debros-namespace-olric@<name>.service --no-pager -n 50`
|
||||
- **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes
|
||||
- **Check gateway health:** `curl http://localhost:<port>/v1/health` from the node itself
|
||||
- **Node IPs:** Check `scripts/remote-nodes.conf` for credentials, `wg show wg0` for WG IPs
|
||||
213
docs/INSPECTOR.md
Normal file
213
docs/INSPECTOR.md
Normal file
@ -0,0 +1,213 @@
|
||||
# Inspector
|
||||
|
||||
The inspector is a cluster health check tool that SSHs into every node, collects subsystem data in parallel, runs deterministic checks, and optionally sends failures to an AI model for root-cause analysis.
|
||||
|
||||
## Pipeline
|
||||
|
||||
```
|
||||
Collect (parallel SSH) → Check (deterministic Go) → Report (table/JSON) → Analyze (optional AI)
|
||||
```
|
||||
|
||||
1. **Collect** — SSH into every node in parallel, run diagnostic commands, parse results into structured data.
|
||||
2. **Check** — Run pure Go check functions against the collected data. Each check produces a pass/fail/warn/skip result with a severity level.
|
||||
3. **Report** — Print results as a table (default) or JSON. Failures sort first, grouped by subsystem.
|
||||
4. **Analyze** — If `--ai` is enabled and there are failures or warnings, send them to an LLM via OpenRouter for root-cause analysis.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Inspect all subsystems on devnet
|
||||
orama inspect --env devnet
|
||||
|
||||
# Inspect only RQLite
|
||||
orama inspect --env devnet --subsystem rqlite
|
||||
|
||||
# JSON output
|
||||
orama inspect --env devnet --format json
|
||||
|
||||
# With AI analysis
|
||||
orama inspect --env devnet --ai
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
orama inspect [flags]
|
||||
```
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--config` | `scripts/remote-nodes.conf` | Path to node configuration file |
|
||||
| `--env` | *(required)* | Environment to inspect (`devnet`, `testnet`) |
|
||||
| `--subsystem` | `all` | Comma-separated subsystems to inspect |
|
||||
| `--format` | `table` | Output format: `table` or `json` |
|
||||
| `--timeout` | `30s` | SSH command timeout per node |
|
||||
| `--verbose` | `false` | Print collection progress |
|
||||
| `--ai` | `false` | Enable AI analysis of failures |
|
||||
| `--model` | `moonshotai/kimi-k2.5` | OpenRouter model for AI analysis |
|
||||
| `--api-key` | `$OPENROUTER_API_KEY` | OpenRouter API key |
|
||||
|
||||
### Subsystem Names
|
||||
|
||||
`rqlite`, `olric`, `ipfs`, `dns`, `wireguard` (alias: `wg`), `system`, `network`, `namespace`
|
||||
|
||||
Multiple subsystems can be combined: `--subsystem rqlite,olric,dns`
|
||||
|
||||
## Subsystems
|
||||
|
||||
| Subsystem | What It Checks |
|
||||
|-----------|---------------|
|
||||
| **rqlite** | Raft state, leader election, readyz, commit/applied gap, FSM pending, strong reads, debug vars (query errors, leader_not_found, snapshots), cross-node leader agreement, term consistency, applied index convergence, quorum, version match |
|
||||
| **olric** | Service active, memberlist up, restart count, memory usage, log analysis (suspects, flapping, errors), cross-node memberlist consistency |
|
||||
| **ipfs** | Daemon active, cluster active, swarm peer count, cluster peer count, cluster errors, repo usage %, swarm key present, bootstrap list empty, cross-node version consistency |
|
||||
| **dns** | CoreDNS active, Caddy active, ports (53/80/443), memory, restart count, log errors, Corefile exists, SOA/NS/wildcard/base-A resolution, TLS cert expiry, cross-node nameserver availability |
|
||||
| **wireguard** | Interface up, service active, correct 10.0.0.x IP, listen port 51820, peer count vs expected, MTU 1420, config exists + permissions 600, peer handshakes (fresh/stale/never), peer traffic, catch-all route detection, cross-node peer count + MTU consistency |
|
||||
| **system** | Core services (debros-node, rqlite, olric, ipfs, ipfs-cluster, wg-quick), nameserver services (coredns, caddy), failed systemd units, memory/disk/inode usage, load average, OOM kills, swap, UFW active, process user (debros), panic count, expected ports |
|
||||
| **network** | Internet reachability, default route, WireGuard route, TCP connection count, TIME_WAIT count, TCP retransmission rate, WireGuard mesh ping (all peers) |
|
||||
| **namespace** | Per-namespace: RQLite up + raft state + readyz, Olric memberlist, Gateway HTTP health. Cross-namespace: all-healthy check, RQLite quorum per namespace |
|
||||
|
||||
## Severity Levels
|
||||
|
||||
| Level | When Used |
|
||||
|-------|-----------|
|
||||
| **CRITICAL** | Service completely down. Raft quorum lost, RQLite unresponsive, no leader. |
|
||||
| **HIGH** | Service degraded. Olric down, gateway not responding, IPFS swarm key missing. |
|
||||
| **MEDIUM** | Non-ideal but functional. Stale handshakes, elevated memory, log suspects. |
|
||||
| **LOW** | Informational. Non-standard MTU, port mismatch, version skew. |
|
||||
|
||||
## Check Statuses
|
||||
|
||||
| Status | Meaning |
|
||||
|--------|---------|
|
||||
| **pass** | Check passed. |
|
||||
| **fail** | Check failed — action needed. |
|
||||
| **warn** | Degraded — monitor or investigate. |
|
||||
| **skip** | Check could not run (insufficient data). |
|
||||
|
||||
## Output Formats
|
||||
|
||||
### Table (default)
|
||||
|
||||
```
|
||||
Inspecting 14 devnet nodes...
|
||||
|
||||
## RQLITE
|
||||
----------------------------------------------------------------------
|
||||
OK [CRITICAL] RQLite responding (ubuntu@10.0.0.1)
|
||||
responsive=true version=v8.36.16
|
||||
FAIL [CRITICAL] Cluster has exactly one leader
|
||||
leaders=0 (NO LEADER)
|
||||
...
|
||||
|
||||
======================================================================
|
||||
Summary: 800 passed, 12 failed, 31 warnings, 0 skipped (4.2s)
|
||||
```
|
||||
|
||||
Failures sort first, then warnings, then passes. Within each group, higher severity checks appear first.
|
||||
|
||||
### JSON (`--format json`)
|
||||
|
||||
```json
|
||||
{
|
||||
"summary": {
|
||||
"passed": 800,
|
||||
"failed": 12,
|
||||
"warned": 31,
|
||||
"skipped": 0,
|
||||
"total": 843,
|
||||
"duration_seconds": 4.2
|
||||
},
|
||||
"checks": [
|
||||
{
|
||||
"id": "rqlite.responsive",
|
||||
"name": "RQLite responding",
|
||||
"subsystem": "rqlite",
|
||||
"severity": 3,
|
||||
"status": "pass",
|
||||
"message": "responsive=true version=v8.36.16",
|
||||
"node": "ubuntu@10.0.0.1"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## AI Analysis
|
||||
|
||||
When `--ai` is enabled, failures and warnings are sent to an LLM via OpenRouter for root-cause analysis.
|
||||
|
||||
```bash
|
||||
# Use default model (kimi-k2.5)
|
||||
orama inspect --env devnet --ai
|
||||
|
||||
# Use a different model
|
||||
orama inspect --env devnet --ai --model openai/gpt-4o
|
||||
|
||||
# Pass API key directly
|
||||
orama inspect --env devnet --ai --api-key sk-or-...
|
||||
```
|
||||
|
||||
The API key can be set via:
|
||||
1. `--api-key` flag
|
||||
2. `OPENROUTER_API_KEY` environment variable
|
||||
3. `.env` file in the current directory
|
||||
|
||||
The AI receives the full check results plus cluster metadata and returns a structured analysis with likely root causes and suggested fixes.
|
||||
|
||||
## Exit Codes
|
||||
|
||||
| Code | Meaning |
|
||||
|------|---------|
|
||||
| `0` | All checks passed (or only warnings). |
|
||||
| `1` | At least one check failed. |
|
||||
|
||||
## Configuration
|
||||
|
||||
The inspector reads node definitions from a pipe-delimited config file (default: `scripts/remote-nodes.conf`).
|
||||
|
||||
### Format
|
||||
|
||||
```
|
||||
# environment|user@host|password|role|ssh_key
|
||||
devnet|ubuntu@1.2.3.4|mypassword|node|
|
||||
devnet|ubuntu@5.6.7.8|mypassword|nameserver-ns1|/path/to/key
|
||||
```
|
||||
|
||||
| Field | Description |
|
||||
|-------|-------------|
|
||||
| `environment` | Cluster name (`devnet`, `testnet`) |
|
||||
| `user@host` | SSH credentials |
|
||||
| `password` | SSH password |
|
||||
| `role` | `node` or `nameserver-ns1`, `nameserver-ns2`, etc. |
|
||||
| `ssh_key` | Optional path to SSH private key |
|
||||
|
||||
Blank lines and lines starting with `#` are ignored.
|
||||
|
||||
### Node Roles
|
||||
|
||||
- **`node`** — Regular cluster node. Runs RQLite, Olric, IPFS, WireGuard, namespaces.
|
||||
- **`nameserver-*`** — DNS nameserver. Runs CoreDNS + Caddy in addition to base services. System checks verify nameserver-specific services.
|
||||
|
||||
## Examples
|
||||
|
||||
```bash
|
||||
# Full cluster inspection
|
||||
orama inspect --env devnet
|
||||
|
||||
# Check only networking
|
||||
orama inspect --env devnet --subsystem wireguard,network
|
||||
|
||||
# Quick RQLite health check
|
||||
orama inspect --env devnet --subsystem rqlite
|
||||
|
||||
# Verbose mode (shows collection progress)
|
||||
orama inspect --env devnet --verbose
|
||||
|
||||
# JSON for scripting / piping
|
||||
orama inspect --env devnet --format json | jq '.checks[] | select(.status == "fail")'
|
||||
|
||||
# AI-assisted debugging
|
||||
orama inspect --env devnet --ai --model anthropic/claude-sonnet-4
|
||||
|
||||
# Custom config file
|
||||
orama inspect --config /path/to/nodes.conf --env testnet
|
||||
```
|
||||
@ -1,415 +0,0 @@
|
||||
//go:build e2e
|
||||
|
||||
package cluster_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/DeBrosOfficial/network/pkg/ipfs"
|
||||
)
|
||||
|
||||
// Note: These tests connect directly to IPFS Cluster API (localhost:9094)
|
||||
// and IPFS API (localhost:4501). They are for local development only.
|
||||
// For production testing, use storage_http_test.go which uses gateway endpoints.
|
||||
|
||||
func TestIPFSCluster_Health(t *testing.T) {
|
||||
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
err = client.Health(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("health check failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIPFSCluster_GetPeerCount(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
peerCount, err := client.GetPeerCount(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("get peer count failed: %v", err)
|
||||
}
|
||||
|
||||
if peerCount < 0 {
|
||||
t.Fatalf("expected non-negative peer count, got %d", peerCount)
|
||||
}
|
||||
|
||||
t.Logf("IPFS cluster peers: %d", peerCount)
|
||||
}
|
||||
|
||||
func TestIPFSCluster_AddFile(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
content := []byte("IPFS cluster test content")
|
||||
result, err := client.Add(ctx, bytes.NewReader(content), "test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Cid == "" {
|
||||
t.Fatalf("expected non-empty CID")
|
||||
}
|
||||
|
||||
if result.Size != int64(len(content)) {
|
||||
t.Fatalf("expected size %d, got %d", len(content), result.Size)
|
||||
}
|
||||
|
||||
t.Logf("Added file with CID: %s", result.Cid)
|
||||
}
|
||||
|
||||
func TestIPFSCluster_PinFile(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add file first
|
||||
content := []byte("IPFS pin test content")
|
||||
addResult, err := client.Add(ctx, bytes.NewReader(content), "pin-test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
cid := addResult.Cid
|
||||
|
||||
// Pin the file
|
||||
pinResult, err := client.Pin(ctx, cid, "pinned-file", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("pin file failed: %v", err)
|
||||
}
|
||||
|
||||
if pinResult.Cid != cid {
|
||||
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
|
||||
}
|
||||
|
||||
t.Logf("Pinned file: %s", cid)
|
||||
}
|
||||
|
||||
func TestIPFSCluster_PinStatus(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add and pin file
|
||||
content := []byte("IPFS status test content")
|
||||
addResult, err := client.Add(ctx, bytes.NewReader(content), "status-test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
cid := addResult.Cid
|
||||
|
||||
pinResult, err := client.Pin(ctx, cid, "status-test", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("pin file failed: %v", err)
|
||||
}
|
||||
|
||||
if pinResult.Cid != cid {
|
||||
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
|
||||
}
|
||||
|
||||
// Give pin time to propagate
|
||||
e2e.Delay(1000)
|
||||
|
||||
// Get status
|
||||
status, err := client.PinStatus(ctx, cid)
|
||||
if err != nil {
|
||||
t.Fatalf("get pin status failed: %v", err)
|
||||
}
|
||||
|
||||
if status.Cid != cid {
|
||||
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
|
||||
}
|
||||
|
||||
if status.Name != "status-test" {
|
||||
t.Fatalf("expected name 'status-test', got %s", status.Name)
|
||||
}
|
||||
|
||||
if status.ReplicationFactor < 1 {
|
||||
t.Logf("warning: replication factor is %d, expected >= 1", status.ReplicationFactor)
|
||||
}
|
||||
|
||||
t.Logf("Pin status: %s (replication: %d, peers: %d)", status.Status, status.ReplicationFactor, len(status.Peers))
|
||||
}
|
||||
|
||||
func TestIPFSCluster_UnpinFile(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add and pin file
|
||||
content := []byte("IPFS unpin test content")
|
||||
addResult, err := client.Add(ctx, bytes.NewReader(content), "unpin-test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
cid := addResult.Cid
|
||||
|
||||
_, err = client.Pin(ctx, cid, "unpin-test", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("pin file failed: %v", err)
|
||||
}
|
||||
|
||||
// Unpin file
|
||||
err = client.Unpin(ctx, cid)
|
||||
if err != nil {
|
||||
t.Fatalf("unpin file failed: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("Unpinned file: %s", cid)
|
||||
}
|
||||
|
||||
func TestIPFSCluster_GetFile(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add file
|
||||
content := []byte("IPFS get test content")
|
||||
addResult, err := client.Add(ctx, bytes.NewReader(content), "get-test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
cid := addResult.Cid
|
||||
|
||||
// Give time for propagation
|
||||
e2e.Delay(1000)
|
||||
|
||||
// Get file
|
||||
rc, err := client.Get(ctx, cid, e2e.GetIPFSAPIURL())
|
||||
if err != nil {
|
||||
t.Fatalf("get file failed: %v", err)
|
||||
}
|
||||
defer rc.Close()
|
||||
|
||||
retrievedContent, err := io.ReadAll(rc)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read content: %v", err)
|
||||
}
|
||||
|
||||
if !bytes.Equal(retrievedContent, content) {
|
||||
t.Fatalf("content mismatch: expected %q, got %q", string(content), string(retrievedContent))
|
||||
}
|
||||
|
||||
t.Logf("Retrieved file: %s (%d bytes)", cid, len(retrievedContent))
|
||||
}
|
||||
|
||||
func TestIPFSCluster_LargeFile(t *testing.T) {
|
||||
e2e.SkipIfProduction(t)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Create 5MB file
|
||||
content := bytes.Repeat([]byte("x"), 5*1024*1024)
|
||||
result, err := client.Add(ctx, bytes.NewReader(content), "large.bin")
|
||||
if err != nil {
|
||||
t.Fatalf("add large file failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Cid == "" {
|
||||
t.Fatalf("expected non-empty CID")
|
||||
}
|
||||
|
||||
if result.Size != int64(len(content)) {
|
||||
t.Fatalf("expected size %d, got %d", len(content), result.Size)
|
||||
}
|
||||
|
||||
t.Logf("Added large file with CID: %s (%d bytes)", result.Cid, result.Size)
|
||||
}
|
||||
|
||||
func TestIPFSCluster_ReplicationFactor(t *testing.T) {
|
||||
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add file
|
||||
content := []byte("IPFS replication test content")
|
||||
addResult, err := client.Add(ctx, bytes.NewReader(content), "replication-test.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("add file failed: %v", err)
|
||||
}
|
||||
|
||||
cid := addResult.Cid
|
||||
|
||||
// Pin with specific replication factor
|
||||
replicationFactor := 2
|
||||
pinResult, err := client.Pin(ctx, cid, "replication-test", replicationFactor)
|
||||
if err != nil {
|
||||
t.Fatalf("pin file failed: %v", err)
|
||||
}
|
||||
|
||||
if pinResult.Cid != cid {
|
||||
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
|
||||
}
|
||||
|
||||
// Give time for replication
|
||||
e2e.Delay(2000)
|
||||
|
||||
// Check status
|
||||
status, err := client.PinStatus(ctx, cid)
|
||||
if err != nil {
|
||||
t.Fatalf("get pin status failed: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("Replication factor: requested=%d, actual=%d, peers=%d", replicationFactor, status.ReplicationFactor, len(status.Peers))
|
||||
}
|
||||
|
||||
func TestIPFSCluster_MultipleFiles(t *testing.T) {
|
||||
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
logger := e2e.NewTestLogger(t)
|
||||
cfg := ipfs.Config{
|
||||
ClusterAPIURL: e2e.GetIPFSClusterURL(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
client, err := ipfs.NewClient(cfg, logger)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create IPFS client: %v", err)
|
||||
}
|
||||
|
||||
// Add multiple files
|
||||
numFiles := 5
|
||||
var cids []string
|
||||
|
||||
for i := 0; i < numFiles; i++ {
|
||||
content := []byte(fmt.Sprintf("File %d", i))
|
||||
result, err := client.Add(ctx, bytes.NewReader(content), fmt.Sprintf("file%d.txt", i))
|
||||
if err != nil {
|
||||
t.Fatalf("add file %d failed: %v", i, err)
|
||||
}
|
||||
cids = append(cids, result.Cid)
|
||||
}
|
||||
|
||||
if len(cids) != numFiles {
|
||||
t.Fatalf("expected %d files added, got %d", numFiles, len(cids))
|
||||
}
|
||||
|
||||
// Verify all files exist
|
||||
for i, cid := range cids {
|
||||
status, err := client.PinStatus(ctx, cid)
|
||||
if err != nil {
|
||||
t.Logf("warning: failed to get status for file %d: %v", i, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if status.Cid != cid {
|
||||
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("Successfully added and verified %d files", numFiles)
|
||||
}
|
||||
@ -1,296 +0,0 @@
|
||||
//go:build e2e
|
||||
|
||||
package cluster_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
)
|
||||
|
||||
func TestLibP2P_PeerConnectivity(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create and connect client
|
||||
c := e2e.NewNetworkClient(t)
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("connect failed: %v", err)
|
||||
}
|
||||
defer c.Disconnect()
|
||||
|
||||
// Verify peer connectivity through the gateway
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/peers",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("peers request failed: %v", err)
|
||||
}
|
||||
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", status)
|
||||
}
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := e2e.DecodeJSON(body, &resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
peers := resp["peers"].([]interface{})
|
||||
if len(peers) == 0 {
|
||||
t.Logf("warning: no peers connected (cluster may still be initializing)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLibP2P_BootstrapPeers(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
bootstrapPeers := e2e.GetBootstrapPeers()
|
||||
if len(bootstrapPeers) == 0 {
|
||||
t.Skipf("E2E_BOOTSTRAP_PEERS not set; skipping")
|
||||
}
|
||||
|
||||
// Create client with bootstrap peers explicitly set
|
||||
c := e2e.NewNetworkClient(t)
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("connect failed: %v", err)
|
||||
}
|
||||
defer c.Disconnect()
|
||||
|
||||
// Give peer discovery time
|
||||
e2e.Delay(2000)
|
||||
|
||||
// Verify we're connected (check via gateway status)
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/status",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("status request failed: %v", err)
|
||||
}
|
||||
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", status)
|
||||
}
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := e2e.DecodeJSON(body, &resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
if resp["connected"] != true {
|
||||
t.Logf("warning: client not connected to network (cluster may still be initializing)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLibP2P_MultipleClientConnections(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create multiple clients
|
||||
c1 := e2e.NewNetworkClient(t)
|
||||
c2 := e2e.NewNetworkClient(t)
|
||||
c3 := e2e.NewNetworkClient(t)
|
||||
|
||||
if err := c1.Connect(); err != nil {
|
||||
t.Fatalf("c1 connect failed: %v", err)
|
||||
}
|
||||
defer c1.Disconnect()
|
||||
|
||||
if err := c2.Connect(); err != nil {
|
||||
t.Fatalf("c2 connect failed: %v", err)
|
||||
}
|
||||
defer c2.Disconnect()
|
||||
|
||||
if err := c3.Connect(); err != nil {
|
||||
t.Fatalf("c3 connect failed: %v", err)
|
||||
}
|
||||
defer c3.Disconnect()
|
||||
|
||||
// Give peer discovery time
|
||||
e2e.Delay(2000)
|
||||
|
||||
// Verify gateway sees multiple peers
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/peers",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("peers request failed: %v", err)
|
||||
}
|
||||
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", status)
|
||||
}
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := e2e.DecodeJSON(body, &resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
peers := resp["peers"].([]interface{})
|
||||
if len(peers) < 1 {
|
||||
t.Logf("warning: expected at least 1 peer, got %d", len(peers))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLibP2P_ReconnectAfterDisconnect(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
c := e2e.NewNetworkClient(t)
|
||||
|
||||
// Connect
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("connect failed: %v", err)
|
||||
}
|
||||
|
||||
// Verify connected via gateway
|
||||
req1 := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/status",
|
||||
}
|
||||
|
||||
_, status1, err := req1.Do(ctx)
|
||||
if err != nil || status1 != http.StatusOK {
|
||||
t.Logf("warning: gateway check failed before disconnect: status %d, err %v", status1, err)
|
||||
}
|
||||
|
||||
// Disconnect
|
||||
if err := c.Disconnect(); err != nil {
|
||||
t.Logf("warning: disconnect failed: %v", err)
|
||||
}
|
||||
|
||||
// Give time for disconnect to propagate
|
||||
e2e.Delay(500)
|
||||
|
||||
// Reconnect
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("reconnect failed: %v", err)
|
||||
}
|
||||
defer c.Disconnect()
|
||||
|
||||
// Verify connected via gateway again
|
||||
req2 := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/status",
|
||||
}
|
||||
|
||||
_, status2, err := req2.Do(ctx)
|
||||
if err != nil || status2 != http.StatusOK {
|
||||
t.Logf("warning: gateway check failed after reconnect: status %d, err %v", status2, err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLibP2P_PeerDiscovery(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create client
|
||||
c := e2e.NewNetworkClient(t)
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("connect failed: %v", err)
|
||||
}
|
||||
defer c.Disconnect()
|
||||
|
||||
// Give peer discovery time
|
||||
e2e.Delay(3000)
|
||||
|
||||
// Get peer list
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/peers",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("peers request failed: %v", err)
|
||||
}
|
||||
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", status)
|
||||
}
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := e2e.DecodeJSON(body, &resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
peers := resp["peers"].([]interface{})
|
||||
if len(peers) == 0 {
|
||||
t.Logf("warning: no peers discovered (cluster may not have multiple nodes)")
|
||||
} else {
|
||||
// Verify peer format (should be multiaddr strings)
|
||||
for _, p := range peers {
|
||||
peerStr := p.(string)
|
||||
if !strings.Contains(peerStr, "/p2p/") && !strings.Contains(peerStr, "/ipfs/") {
|
||||
t.Logf("warning: unexpected peer format: %s", peerStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLibP2P_PeerAddressFormat(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create client
|
||||
c := e2e.NewNetworkClient(t)
|
||||
if err := c.Connect(); err != nil {
|
||||
t.Fatalf("connect failed: %v", err)
|
||||
}
|
||||
defer c.Disconnect()
|
||||
|
||||
// Get peer list
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/network/peers",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("peers request failed: %v", err)
|
||||
}
|
||||
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d", status)
|
||||
}
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := e2e.DecodeJSON(body, &resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
peers := resp["peers"].([]interface{})
|
||||
for _, p := range peers {
|
||||
peerStr := p.(string)
|
||||
// Multiaddrs should start with /
|
||||
if !strings.HasPrefix(peerStr, "/") {
|
||||
t.Fatalf("expected multiaddr format, got %s", peerStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,338 +0,0 @@
|
||||
//go:build e2e
|
||||
|
||||
package cluster_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// =============================================================================
|
||||
// STRICT OLRIC CACHE DISTRIBUTION TESTS
|
||||
// These tests verify that Olric cache data is properly distributed across nodes.
|
||||
// Tests FAIL if distribution doesn't work - no skips, no warnings.
|
||||
// =============================================================================
|
||||
|
||||
// getOlricNodeAddresses returns HTTP addresses of Olric nodes
|
||||
// Note: Olric HTTP port is typically on port 3320 for the main cluster
|
||||
func getOlricNodeAddresses() []string {
|
||||
// In dev mode, we have a single Olric instance
|
||||
// In production, each node runs its own Olric instance
|
||||
return []string{
|
||||
"http://localhost:3320",
|
||||
}
|
||||
}
|
||||
|
||||
// TestOlric_BasicDistribution verifies cache operations work across the cluster.
|
||||
func TestOlric_BasicDistribution(t *testing.T) {
|
||||
// Note: Not using SkipIfMissingGateway() since LoadTestEnv() creates its own API key
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "FAIL: Could not load test environment")
|
||||
require.NotEmpty(t, env.APIKey, "FAIL: No API key available")
|
||||
|
||||
dmap := fmt.Sprintf("dist_test_%d", time.Now().UnixNano())
|
||||
|
||||
t.Run("Put_and_get_from_same_gateway", func(t *testing.T) {
|
||||
key := fmt.Sprintf("key_%d", time.Now().UnixNano())
|
||||
value := fmt.Sprintf("value_%d", time.Now().UnixNano())
|
||||
|
||||
// Put
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
require.NoError(t, err, "FAIL: Could not put value to cache")
|
||||
|
||||
// Get
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get value from cache")
|
||||
require.Equal(t, value, retrieved, "FAIL: Retrieved value doesn't match")
|
||||
|
||||
t.Logf(" ✓ Put/Get works: %s = %s", key, value)
|
||||
})
|
||||
|
||||
t.Run("Multiple_keys_distributed", func(t *testing.T) {
|
||||
// Put multiple keys (should be distributed across partitions)
|
||||
keys := make(map[string]string)
|
||||
for i := 0; i < 20; i++ {
|
||||
key := fmt.Sprintf("dist_key_%d_%d", i, time.Now().UnixNano())
|
||||
value := fmt.Sprintf("dist_value_%d", i)
|
||||
keys[key] = value
|
||||
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
require.NoError(t, err, "FAIL: Could not put key %s", key)
|
||||
}
|
||||
|
||||
t.Logf(" Put 20 keys to cache")
|
||||
|
||||
// Verify all keys are retrievable
|
||||
for key, expectedValue := range keys {
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get key %s", key)
|
||||
require.Equal(t, expectedValue, retrieved, "FAIL: Value mismatch for key %s", key)
|
||||
}
|
||||
|
||||
t.Logf(" ✓ All 20 keys are retrievable")
|
||||
})
|
||||
}
|
||||
|
||||
// TestOlric_ConcurrentAccess verifies cache handles concurrent operations correctly.
|
||||
func TestOlric_ConcurrentAccess(t *testing.T) {
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "FAIL: Could not load test environment")
|
||||
|
||||
dmap := fmt.Sprintf("concurrent_test_%d", time.Now().UnixNano())
|
||||
|
||||
t.Run("Concurrent_writes_to_same_key", func(t *testing.T) {
|
||||
key := fmt.Sprintf("concurrent_key_%d", time.Now().UnixNano())
|
||||
|
||||
// Launch multiple goroutines writing to the same key
|
||||
done := make(chan error, 10)
|
||||
for i := 0; i < 10; i++ {
|
||||
go func(idx int) {
|
||||
value := fmt.Sprintf("concurrent_value_%d", idx)
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
done <- err
|
||||
}(i)
|
||||
}
|
||||
|
||||
// Wait for all writes
|
||||
var errors []error
|
||||
for i := 0; i < 10; i++ {
|
||||
if err := <-done; err != nil {
|
||||
errors = append(errors, err)
|
||||
}
|
||||
}
|
||||
|
||||
require.Empty(t, errors, "FAIL: %d concurrent writes failed: %v", len(errors), errors)
|
||||
|
||||
// The key should have ONE of the values (last write wins)
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get key after concurrent writes")
|
||||
require.Contains(t, retrieved, "concurrent_value_", "FAIL: Value doesn't match expected pattern")
|
||||
|
||||
t.Logf(" ✓ Concurrent writes succeeded, final value: %s", retrieved)
|
||||
})
|
||||
|
||||
t.Run("Concurrent_reads_and_writes", func(t *testing.T) {
|
||||
key := fmt.Sprintf("rw_key_%d", time.Now().UnixNano())
|
||||
initialValue := "initial_value"
|
||||
|
||||
// Set initial value
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, initialValue)
|
||||
require.NoError(t, err, "FAIL: Could not set initial value")
|
||||
|
||||
// Launch concurrent readers and writers
|
||||
done := make(chan error, 20)
|
||||
|
||||
// 10 readers
|
||||
for i := 0; i < 10; i++ {
|
||||
go func() {
|
||||
_, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
done <- err
|
||||
}()
|
||||
}
|
||||
|
||||
// 10 writers
|
||||
for i := 0; i < 10; i++ {
|
||||
go func(idx int) {
|
||||
value := fmt.Sprintf("updated_value_%d", idx)
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
done <- err
|
||||
}(i)
|
||||
}
|
||||
|
||||
// Wait for all operations
|
||||
var readErrors, writeErrors []error
|
||||
for i := 0; i < 20; i++ {
|
||||
if err := <-done; err != nil {
|
||||
if i < 10 {
|
||||
readErrors = append(readErrors, err)
|
||||
} else {
|
||||
writeErrors = append(writeErrors, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
require.Empty(t, readErrors, "FAIL: %d reads failed", len(readErrors))
|
||||
require.Empty(t, writeErrors, "FAIL: %d writes failed", len(writeErrors))
|
||||
|
||||
t.Logf(" ✓ Concurrent read/write operations succeeded")
|
||||
})
|
||||
}
|
||||
|
||||
// TestOlric_NamespaceClusterCache verifies cache works in namespace-specific clusters.
|
||||
func TestOlric_NamespaceClusterCache(t *testing.T) {
|
||||
// Create a new namespace
|
||||
namespace := fmt.Sprintf("cache-test-%d", time.Now().UnixNano())
|
||||
|
||||
env, err := e2e.LoadTestEnvWithNamespace(namespace)
|
||||
require.NoError(t, err, "FAIL: Could not create namespace for cache test")
|
||||
require.NotEmpty(t, env.APIKey, "FAIL: No API key")
|
||||
|
||||
t.Logf("Created namespace %s", namespace)
|
||||
|
||||
dmap := fmt.Sprintf("ns_cache_%d", time.Now().UnixNano())
|
||||
|
||||
t.Run("Cache_operations_work_in_namespace", func(t *testing.T) {
|
||||
key := fmt.Sprintf("ns_key_%d", time.Now().UnixNano())
|
||||
value := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
|
||||
|
||||
// Put using namespace API key
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
require.NoError(t, err, "FAIL: Could not put value in namespace cache")
|
||||
|
||||
// Get
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get value from namespace cache")
|
||||
require.Equal(t, value, retrieved, "FAIL: Value mismatch in namespace cache")
|
||||
|
||||
t.Logf(" ✓ Namespace cache operations work: %s = %s", key, value)
|
||||
})
|
||||
|
||||
// Check if namespace Olric instances are running (port 10003 offset in port blocks)
|
||||
var nsOlricPorts []int
|
||||
for port := 10003; port <= 10098; port += 5 {
|
||||
conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 1*time.Second)
|
||||
if err == nil {
|
||||
conn.Close()
|
||||
nsOlricPorts = append(nsOlricPorts, port)
|
||||
}
|
||||
}
|
||||
|
||||
if len(nsOlricPorts) > 0 {
|
||||
t.Logf("Found %d namespace Olric memberlist ports: %v", len(nsOlricPorts), nsOlricPorts)
|
||||
|
||||
t.Run("Namespace_Olric_nodes_connected", func(t *testing.T) {
|
||||
// Verify all namespace Olric nodes can be reached
|
||||
for _, port := range nsOlricPorts {
|
||||
conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 2*time.Second)
|
||||
require.NoError(t, err, "FAIL: Cannot connect to namespace Olric on port %d", port)
|
||||
conn.Close()
|
||||
t.Logf(" ✓ Namespace Olric memberlist on port %d is reachable", port)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestOlric_DataConsistency verifies data remains consistent across operations.
|
||||
func TestOlric_DataConsistency(t *testing.T) {
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "FAIL: Could not load test environment")
|
||||
|
||||
dmap := fmt.Sprintf("consistency_test_%d", time.Now().UnixNano())
|
||||
|
||||
t.Run("Update_preserves_latest_value", func(t *testing.T) {
|
||||
key := fmt.Sprintf("update_key_%d", time.Now().UnixNano())
|
||||
|
||||
// Write multiple times
|
||||
for i := 1; i <= 5; i++ {
|
||||
value := fmt.Sprintf("version_%d", i)
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
require.NoError(t, err, "FAIL: Could not update key to version %d", i)
|
||||
}
|
||||
|
||||
// Final read should return latest version
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not read final value")
|
||||
require.Equal(t, "version_5", retrieved, "FAIL: Latest version not preserved")
|
||||
|
||||
t.Logf(" ✓ Latest value preserved after 5 updates")
|
||||
})
|
||||
|
||||
t.Run("Delete_removes_key", func(t *testing.T) {
|
||||
key := fmt.Sprintf("delete_key_%d", time.Now().UnixNano())
|
||||
value := "to_be_deleted"
|
||||
|
||||
// Put
|
||||
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
|
||||
require.NoError(t, err, "FAIL: Could not put value")
|
||||
|
||||
// Verify it exists
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get value before delete")
|
||||
require.Equal(t, value, retrieved)
|
||||
|
||||
// Delete (POST with JSON body)
|
||||
deleteBody := map[string]interface{}{
|
||||
"dmap": dmap,
|
||||
"key": key,
|
||||
}
|
||||
deleteBytes, _ := json.Marshal(deleteBody)
|
||||
req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/delete", strings.NewReader(string(deleteBytes)))
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+env.APIKey)
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
require.NoError(t, err, "FAIL: Delete request failed")
|
||||
resp.Body.Close()
|
||||
require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusNoContent,
|
||||
"FAIL: Delete returned unexpected status %d", resp.StatusCode)
|
||||
|
||||
// Verify key is gone
|
||||
_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.Error(t, err, "FAIL: Key should not exist after delete")
|
||||
require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error")
|
||||
|
||||
t.Logf(" ✓ Delete properly removes key")
|
||||
})
|
||||
}
|
||||
|
||||
// TestOlric_TTLExpiration verifies TTL expiration works.
|
||||
// NOTE: TTL is currently parsed but not applied by the cache handler (TODO in set_handler.go).
|
||||
// This test is skipped until TTL support is fully implemented.
|
||||
func TestOlric_TTLExpiration(t *testing.T) {
|
||||
t.Skip("TTL support not yet implemented in cache handler - see set_handler.go lines 88-98")
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "FAIL: Could not load test environment")
|
||||
|
||||
dmap := fmt.Sprintf("ttl_test_%d", time.Now().UnixNano())
|
||||
|
||||
t.Run("Key_expires_after_TTL", func(t *testing.T) {
|
||||
key := fmt.Sprintf("ttl_key_%d", time.Now().UnixNano())
|
||||
value := "expires_soon"
|
||||
ttlSeconds := 3
|
||||
|
||||
// Put with TTL (TTL is a duration string like "3s", "1m", etc.)
|
||||
reqBody := map[string]interface{}{
|
||||
"dmap": dmap,
|
||||
"key": key,
|
||||
"value": value,
|
||||
"ttl": fmt.Sprintf("%ds", ttlSeconds),
|
||||
}
|
||||
bodyBytes, _ := json.Marshal(reqBody)
|
||||
|
||||
req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/put", strings.NewReader(string(bodyBytes)))
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+env.APIKey)
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
require.NoError(t, err, "FAIL: Put with TTL failed")
|
||||
resp.Body.Close()
|
||||
require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated,
|
||||
"FAIL: Put returned status %d", resp.StatusCode)
|
||||
|
||||
// Verify key exists immediately
|
||||
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.NoError(t, err, "FAIL: Could not get key immediately after put")
|
||||
require.Equal(t, value, retrieved)
|
||||
t.Logf(" Key exists immediately after put")
|
||||
|
||||
// Wait for TTL to expire (plus buffer)
|
||||
time.Sleep(time.Duration(ttlSeconds+2) * time.Second)
|
||||
|
||||
// Key should be gone
|
||||
_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
|
||||
require.Error(t, err, "FAIL: Key should have expired after %d seconds", ttlSeconds)
|
||||
require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error after TTL")
|
||||
|
||||
t.Logf(" ✓ Key expired after %d seconds as expected", ttlSeconds)
|
||||
})
|
||||
}
|
||||
@ -1,479 +0,0 @@
|
||||
//go:build e2e
|
||||
|
||||
package cluster_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// =============================================================================
|
||||
// STRICT RQLITE CLUSTER TESTS
|
||||
// These tests verify that RQLite cluster operations work correctly.
|
||||
// Tests FAIL if operations don't work - no skips, no warnings.
|
||||
// =============================================================================
|
||||
|
||||
// TestRQLite_ClusterHealth verifies the RQLite cluster is healthy and operational.
|
||||
func TestRQLite_ClusterHealth(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Check RQLite schema endpoint (proves cluster is reachable)
|
||||
req := &e2e.HTTPRequest{
|
||||
Method: http.MethodGet,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/schema",
|
||||
}
|
||||
|
||||
body, status, err := req.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Could not reach RQLite cluster")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: RQLite schema endpoint returned %d: %s", status, string(body))
|
||||
|
||||
var schemaResp map[string]interface{}
|
||||
err = e2e.DecodeJSON(body, &schemaResp)
|
||||
require.NoError(t, err, "FAIL: Could not decode RQLite schema response")
|
||||
|
||||
// Schema endpoint should return tables array
|
||||
_, hasTables := schemaResp["tables"]
|
||||
require.True(t, hasTables, "FAIL: RQLite schema response missing 'tables' field")
|
||||
|
||||
t.Logf(" ✓ RQLite cluster is healthy and responding")
|
||||
}
|
||||
|
||||
// TestRQLite_WriteReadConsistency verifies data written can be read back consistently.
|
||||
func TestRQLite_WriteReadConsistency(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
table := e2e.GenerateTableName()
|
||||
|
||||
// Cleanup
|
||||
defer func() {
|
||||
dropReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
|
||||
Body: map[string]interface{}{"table": table},
|
||||
}
|
||||
dropReq.Do(context.Background())
|
||||
}()
|
||||
|
||||
// Create table
|
||||
createReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table",
|
||||
Body: map[string]interface{}{
|
||||
"schema": fmt.Sprintf(
|
||||
"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT, created_at DATETIME DEFAULT CURRENT_TIMESTAMP)",
|
||||
table,
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := createReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Create table request failed")
|
||||
require.True(t, status == http.StatusCreated || status == http.StatusOK,
|
||||
"FAIL: Create table returned status %d", status)
|
||||
t.Logf("Created table %s", table)
|
||||
|
||||
t.Run("Write_then_read_returns_same_data", func(t *testing.T) {
|
||||
uniqueValue := fmt.Sprintf("test_value_%d", time.Now().UnixNano())
|
||||
|
||||
// Insert
|
||||
insertReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction",
|
||||
Body: map[string]interface{}{
|
||||
"statements": []string{
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := insertReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Insert request failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
|
||||
|
||||
// Read back
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/query",
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
|
||||
},
|
||||
}
|
||||
|
||||
body, status, err := queryReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Query request failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
|
||||
|
||||
var queryResp map[string]interface{}
|
||||
err = e2e.DecodeJSON(body, &queryResp)
|
||||
require.NoError(t, err, "FAIL: Could not decode query response")
|
||||
|
||||
// Verify we got our value back
|
||||
count, ok := queryResp["count"].(float64)
|
||||
require.True(t, ok, "FAIL: Response missing 'count' field")
|
||||
require.Equal(t, float64(1), count, "FAIL: Expected 1 row, got %v", count)
|
||||
|
||||
t.Logf(" ✓ Written value '%s' was read back correctly", uniqueValue)
|
||||
})
|
||||
|
||||
t.Run("Multiple_writes_all_readable", func(t *testing.T) {
|
||||
// Insert multiple values
|
||||
var statements []string
|
||||
for i := 0; i < 10; i++ {
|
||||
statements = append(statements,
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('batch_%d')", table, i))
|
||||
}
|
||||
|
||||
insertReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction",
|
||||
Body: map[string]interface{}{
|
||||
"statements": statements,
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := insertReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Batch insert failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Batch insert returned status %d", status)
|
||||
|
||||
// Count all batch rows
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/query",
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT COUNT(*) as cnt FROM %s WHERE value LIKE 'batch_%%'", table),
|
||||
},
|
||||
}
|
||||
|
||||
body, status, err := queryReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Count query failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status)
|
||||
|
||||
var queryResp map[string]interface{}
|
||||
e2e.DecodeJSON(body, &queryResp)
|
||||
|
||||
if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 {
|
||||
row := rows[0].([]interface{})
|
||||
count := int(row[0].(float64))
|
||||
require.Equal(t, 10, count, "FAIL: Expected 10 batch rows, got %d", count)
|
||||
}
|
||||
|
||||
t.Logf(" ✓ All 10 batch writes are readable")
|
||||
})
|
||||
}
|
||||
|
||||
// TestRQLite_TransactionAtomicity verifies transactions are atomic.
|
||||
func TestRQLite_TransactionAtomicity(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
table := e2e.GenerateTableName()
|
||||
|
||||
// Cleanup
|
||||
defer func() {
|
||||
dropReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
|
||||
Body: map[string]interface{}{"table": table},
|
||||
}
|
||||
dropReq.Do(context.Background())
|
||||
}()
|
||||
|
||||
// Create table
|
||||
createReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table",
|
||||
Body: map[string]interface{}{
|
||||
"schema": fmt.Sprintf(
|
||||
"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT UNIQUE)",
|
||||
table,
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := createReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Create table failed")
|
||||
require.True(t, status == http.StatusCreated || status == http.StatusOK,
|
||||
"FAIL: Create table returned status %d", status)
|
||||
|
||||
t.Run("Successful_transaction_commits_all", func(t *testing.T) {
|
||||
txReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction",
|
||||
Body: map[string]interface{}{
|
||||
"statements": []string{
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_1')", table),
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_2')", table),
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_3')", table),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := txReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Transaction request failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Transaction returned status %d", status)
|
||||
|
||||
// Verify all 3 rows exist
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/query",
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s WHERE value LIKE 'tx_val_%%'", table),
|
||||
},
|
||||
}
|
||||
|
||||
body, _, _ := queryReq.Do(ctx)
|
||||
var queryResp map[string]interface{}
|
||||
e2e.DecodeJSON(body, &queryResp)
|
||||
|
||||
if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 {
|
||||
row := rows[0].([]interface{})
|
||||
count := int(row[0].(float64))
|
||||
require.Equal(t, 3, count, "FAIL: Transaction didn't commit all 3 rows - got %d", count)
|
||||
}
|
||||
|
||||
t.Logf(" ✓ Transaction committed all 3 rows atomically")
|
||||
})
|
||||
|
||||
t.Run("Updates_preserve_consistency", func(t *testing.T) {
|
||||
// Update a value
|
||||
updateReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction",
|
||||
Body: map[string]interface{}{
|
||||
"statements": []string{
|
||||
fmt.Sprintf("UPDATE %s SET value = 'tx_val_1_updated' WHERE value = 'tx_val_1'", table),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := updateReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Update request failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Update returned status %d", status)
|
||||
|
||||
// Verify update took effect
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/query",
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = 'tx_val_1_updated'", table),
|
||||
},
|
||||
}
|
||||
|
||||
body, _, _ := queryReq.Do(ctx)
|
||||
var queryResp map[string]interface{}
|
||||
e2e.DecodeJSON(body, &queryResp)
|
||||
|
||||
count, _ := queryResp["count"].(float64)
|
||||
require.Equal(t, float64(1), count, "FAIL: Update didn't take effect")
|
||||
|
||||
t.Logf(" ✓ Update preserved consistency")
|
||||
})
|
||||
}
|
||||
|
||||
// TestRQLite_ConcurrentWrites verifies the cluster handles concurrent writes correctly.
|
||||
func TestRQLite_ConcurrentWrites(t *testing.T) {
|
||||
e2e.SkipIfMissingGateway(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
table := e2e.GenerateTableName()
|
||||
|
||||
// Cleanup
|
||||
defer func() {
|
||||
dropReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
|
||||
Body: map[string]interface{}{"table": table},
|
||||
}
|
||||
dropReq.Do(context.Background())
|
||||
}()
|
||||
|
||||
// Create table
|
||||
createReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table",
|
||||
Body: map[string]interface{}{
|
||||
"schema": fmt.Sprintf(
|
||||
"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, worker INTEGER, seq INTEGER)",
|
||||
table,
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := createReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Create table failed")
|
||||
require.True(t, status == http.StatusCreated || status == http.StatusOK,
|
||||
"FAIL: Create table returned status %d", status)
|
||||
|
||||
t.Run("Concurrent_inserts_all_succeed", func(t *testing.T) {
|
||||
numWorkers := 5
|
||||
insertsPerWorker := 10
|
||||
expectedTotal := numWorkers * insertsPerWorker
|
||||
|
||||
var wg sync.WaitGroup
|
||||
errChan := make(chan error, numWorkers*insertsPerWorker)
|
||||
|
||||
for w := 0; w < numWorkers; w++ {
|
||||
wg.Add(1)
|
||||
go func(workerID int) {
|
||||
defer wg.Done()
|
||||
for i := 0; i < insertsPerWorker; i++ {
|
||||
insertReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction",
|
||||
Body: map[string]interface{}{
|
||||
"statements": []string{
|
||||
fmt.Sprintf("INSERT INTO %s (worker, seq) VALUES (%d, %d)", table, workerID, i),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := insertReq.Do(ctx)
|
||||
if err != nil {
|
||||
errChan <- fmt.Errorf("worker %d insert %d failed: %w", workerID, i, err)
|
||||
return
|
||||
}
|
||||
if status != http.StatusOK {
|
||||
errChan <- fmt.Errorf("worker %d insert %d got status %d", workerID, i, status)
|
||||
return
|
||||
}
|
||||
}
|
||||
}(w)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
close(errChan)
|
||||
|
||||
// Collect errors
|
||||
var errors []error
|
||||
for err := range errChan {
|
||||
errors = append(errors, err)
|
||||
}
|
||||
require.Empty(t, errors, "FAIL: %d concurrent inserts failed: %v", len(errors), errors)
|
||||
|
||||
// Verify total count
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: e2e.GetGatewayURL() + "/v1/rqlite/query",
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s", table),
|
||||
},
|
||||
}
|
||||
|
||||
body, _, _ := queryReq.Do(ctx)
|
||||
var queryResp map[string]interface{}
|
||||
e2e.DecodeJSON(body, &queryResp)
|
||||
|
||||
if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 {
|
||||
row := rows[0].([]interface{})
|
||||
count := int(row[0].(float64))
|
||||
require.Equal(t, expectedTotal, count,
|
||||
"FAIL: Expected %d total rows from concurrent inserts, got %d", expectedTotal, count)
|
||||
}
|
||||
|
||||
t.Logf(" ✓ All %d concurrent inserts succeeded", expectedTotal)
|
||||
})
|
||||
}
|
||||
|
||||
// TestRQLite_NamespaceClusterOperations verifies RQLite works in namespace clusters.
|
||||
func TestRQLite_NamespaceClusterOperations(t *testing.T) {
|
||||
// Create a new namespace
|
||||
namespace := fmt.Sprintf("rqlite-test-%d", time.Now().UnixNano())
|
||||
|
||||
env, err := e2e.LoadTestEnvWithNamespace(namespace)
|
||||
require.NoError(t, err, "FAIL: Could not create namespace for RQLite test")
|
||||
require.NotEmpty(t, env.APIKey, "FAIL: No API key - namespace provisioning failed")
|
||||
|
||||
t.Logf("Created namespace %s", namespace)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
table := e2e.GenerateTableName()
|
||||
|
||||
// Cleanup
|
||||
defer func() {
|
||||
dropReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: env.GatewayURL + "/v1/rqlite/drop-table",
|
||||
Body: map[string]interface{}{"table": table},
|
||||
Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
|
||||
}
|
||||
dropReq.Do(context.Background())
|
||||
}()
|
||||
|
||||
t.Run("Namespace_RQLite_create_insert_query", func(t *testing.T) {
|
||||
// Create table in namespace cluster
|
||||
createReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: env.GatewayURL + "/v1/rqlite/create-table",
|
||||
Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
|
||||
Body: map[string]interface{}{
|
||||
"schema": fmt.Sprintf(
|
||||
"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT)",
|
||||
table,
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err := createReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Create table in namespace failed")
|
||||
require.True(t, status == http.StatusCreated || status == http.StatusOK,
|
||||
"FAIL: Create table returned status %d", status)
|
||||
|
||||
// Insert data
|
||||
uniqueValue := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
|
||||
insertReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: env.GatewayURL + "/v1/rqlite/transaction",
|
||||
Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
|
||||
Body: map[string]interface{}{
|
||||
"statements": []string{
|
||||
fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, status, err = insertReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Insert in namespace failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
|
||||
|
||||
// Query data
|
||||
queryReq := &e2e.HTTPRequest{
|
||||
Method: http.MethodPost,
|
||||
URL: env.GatewayURL + "/v1/rqlite/query",
|
||||
Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
|
||||
Body: map[string]interface{}{
|
||||
"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
|
||||
},
|
||||
}
|
||||
|
||||
body, status, err := queryReq.Do(ctx)
|
||||
require.NoError(t, err, "FAIL: Query in namespace failed")
|
||||
require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
|
||||
|
||||
var queryResp map[string]interface{}
|
||||
e2e.DecodeJSON(body, &queryResp)
|
||||
|
||||
count, _ := queryResp["count"].(float64)
|
||||
require.Equal(t, float64(1), count, "FAIL: Data not found in namespace cluster")
|
||||
|
||||
t.Logf(" ✓ Namespace RQLite operations work correctly")
|
||||
})
|
||||
}
|
||||
@ -478,11 +478,6 @@ func GetAPIKey() string {
|
||||
return apiKey
|
||||
}
|
||||
|
||||
// GetJWT returns the gateway JWT token (currently not auto-discovered)
|
||||
func GetJWT() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// GetBootstrapPeers returns bootstrap peer addresses from config
|
||||
func GetBootstrapPeers() []string {
|
||||
cacheMutex.RLock()
|
||||
@ -748,10 +743,6 @@ func NewNetworkClient(t *testing.T) client.NetworkClient {
|
||||
cfg.APIKey = GetAPIKey()
|
||||
cfg.QuietMode = true // Suppress debug logs in tests
|
||||
|
||||
if jwt := GetJWT(); jwt != "" {
|
||||
cfg.JWT = jwt
|
||||
}
|
||||
|
||||
if peers := GetBootstrapPeers(); len(peers) > 0 {
|
||||
cfg.BootstrapPeers = peers
|
||||
}
|
||||
|
||||
@ -1,333 +0,0 @@
|
||||
//go:build e2e && production
|
||||
|
||||
package production
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestDNS_MultipleARecords verifies that deploying with replicas creates
|
||||
// multiple A records (one per node) for DNS round-robin.
|
||||
func TestDNS_MultipleARecords(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
if len(env.Config.Servers) < 2 {
|
||||
t.Skip("Requires at least 2 servers")
|
||||
}
|
||||
|
||||
deploymentName := fmt.Sprintf("dns-multi-%d", time.Now().Unix())
|
||||
tarballPath := filepath.Join("../../testdata/apps/react-app")
|
||||
|
||||
deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
|
||||
require.NotEmpty(t, deploymentID)
|
||||
|
||||
defer func() {
|
||||
if !env.SkipCleanup {
|
||||
e2e.DeleteDeployment(t, env, deploymentID)
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for replica setup and DNS propagation
|
||||
time.Sleep(15 * time.Second)
|
||||
|
||||
t.Run("DNS returns multiple IPs", func(t *testing.T) {
|
||||
deployment := e2e.GetDeployment(t, env, deploymentID)
|
||||
subdomain, _ := deployment["subdomain"].(string)
|
||||
if subdomain == "" {
|
||||
subdomain = deploymentName
|
||||
}
|
||||
fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
|
||||
|
||||
// Query nameserver directly
|
||||
nameserverIP := env.Config.Servers[0].IP
|
||||
resolver := &net.Resolver{
|
||||
PreferGo: true,
|
||||
Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
|
||||
d := net.Dialer{Timeout: 10 * time.Second}
|
||||
return d.Dial("udp", nameserverIP+":53")
|
||||
},
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
ips, err := resolver.LookupHost(ctx, fqdn)
|
||||
if err != nil {
|
||||
t.Logf("DNS lookup failed for %s: %v", fqdn, err)
|
||||
t.Log("Trying net.LookupHost instead...")
|
||||
ips, err = net.LookupHost(fqdn)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
t.Logf("DNS lookup failed: %v (DNS may not be propagated yet)", err)
|
||||
t.Skip("DNS not yet propagated")
|
||||
}
|
||||
|
||||
t.Logf("DNS returned %d IPs for %s: %v", len(ips), fqdn, ips)
|
||||
assert.GreaterOrEqual(t, len(ips), 2,
|
||||
"Should have at least 2 A records (home + replica)")
|
||||
|
||||
// Verify returned IPs are from our server list
|
||||
serverIPs := e2e.GetServerIPs(env.Config)
|
||||
for _, ip := range ips {
|
||||
assert.Contains(t, serverIPs, ip,
|
||||
"DNS IP %s should be one of our servers", ip)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestDNS_CleanupOnDelete verifies that deleting a deployment removes all
|
||||
// DNS records (both home and replica A records).
|
||||
func TestDNS_CleanupOnDelete(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
deploymentName := fmt.Sprintf("dns-cleanup-%d", time.Now().Unix())
|
||||
tarballPath := filepath.Join("../../testdata/apps/react-app")
|
||||
|
||||
deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
|
||||
require.NotEmpty(t, deploymentID)
|
||||
|
||||
// Wait for DNS
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
// Get subdomain before deletion
|
||||
deployment := e2e.GetDeployment(t, env, deploymentID)
|
||||
subdomain, _ := deployment["subdomain"].(string)
|
||||
if subdomain == "" {
|
||||
subdomain = deploymentName
|
||||
}
|
||||
fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
|
||||
|
||||
// Verify DNS works before deletion
|
||||
t.Run("DNS resolves before deletion", func(t *testing.T) {
|
||||
nodeURL := extractNodeURLProd(t, deployment)
|
||||
if nodeURL == "" {
|
||||
t.Skip("No URL to test")
|
||||
}
|
||||
domain := extractDomainProd(nodeURL)
|
||||
|
||||
req, _ := http.NewRequest("GET", env.GatewayURL+"/", nil)
|
||||
req.Host = domain
|
||||
|
||||
resp, err := env.HTTPClient.Do(req)
|
||||
if err == nil {
|
||||
resp.Body.Close()
|
||||
t.Logf("Pre-delete: status=%d", resp.StatusCode)
|
||||
}
|
||||
})
|
||||
|
||||
// Delete
|
||||
e2e.DeleteDeployment(t, env, deploymentID)
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
t.Run("DNS records removed after deletion", func(t *testing.T) {
|
||||
ips, err := net.LookupHost(fqdn)
|
||||
if err != nil {
|
||||
t.Logf("DNS lookup failed (expected): %v", err)
|
||||
return // Good — no records
|
||||
}
|
||||
|
||||
// If we still get IPs, they might be cached. Log and warn.
|
||||
if len(ips) > 0 {
|
||||
t.Logf("WARNING: DNS still returns %d IPs after deletion (may be cached): %v", len(ips), ips)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestDNS_CustomSubdomain verifies that deploying with a custom subdomain
|
||||
// creates DNS records using the custom name.
|
||||
func TestDNS_CustomSubdomain(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
deploymentName := fmt.Sprintf("dns-custom-%d", time.Now().Unix())
|
||||
tarballPath := filepath.Join("../../testdata/apps/react-app")
|
||||
|
||||
deploymentID := createDeploymentWithSubdomain(t, env, deploymentName, tarballPath)
|
||||
require.NotEmpty(t, deploymentID)
|
||||
|
||||
defer func() {
|
||||
if !env.SkipCleanup {
|
||||
e2e.DeleteDeployment(t, env, deploymentID)
|
||||
}
|
||||
}()
|
||||
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
t.Run("Deployment has subdomain with random suffix", func(t *testing.T) {
|
||||
deployment := e2e.GetDeployment(t, env, deploymentID)
|
||||
subdomain, _ := deployment["subdomain"].(string)
|
||||
require.NotEmpty(t, subdomain, "Deployment should have a subdomain")
|
||||
t.Logf("Subdomain: %s", subdomain)
|
||||
|
||||
// Verify the subdomain starts with the deployment name
|
||||
assert.Contains(t, subdomain, deploymentName[:10],
|
||||
"Subdomain should relate to deployment name")
|
||||
})
|
||||
}
|
||||
|
||||
// TestDNS_RedeployPreservesSubdomain verifies that updating a deployment
|
||||
// does not change the subdomain/DNS.
|
||||
func TestDNS_RedeployPreservesSubdomain(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
deploymentName := fmt.Sprintf("dns-preserve-%d", time.Now().Unix())
|
||||
tarballPath := filepath.Join("../../testdata/apps/react-app")
|
||||
|
||||
deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
|
||||
require.NotEmpty(t, deploymentID)
|
||||
|
||||
defer func() {
|
||||
if !env.SkipCleanup {
|
||||
e2e.DeleteDeployment(t, env, deploymentID)
|
||||
}
|
||||
}()
|
||||
|
||||
time.Sleep(5 * time.Second)
|
||||
|
||||
// Get original subdomain
|
||||
deployment := e2e.GetDeployment(t, env, deploymentID)
|
||||
originalSubdomain, _ := deployment["subdomain"].(string)
|
||||
originalURLs := deployment["urls"]
|
||||
t.Logf("Original subdomain: %s, urls: %v", originalSubdomain, originalURLs)
|
||||
|
||||
// Update
|
||||
updateStaticDeploymentProd(t, env, deploymentName, tarballPath)
|
||||
time.Sleep(5 * time.Second)
|
||||
|
||||
// Verify subdomain unchanged
|
||||
t.Run("Subdomain unchanged after update", func(t *testing.T) {
|
||||
updated := e2e.GetDeployment(t, env, deploymentID)
|
||||
updatedSubdomain, _ := updated["subdomain"].(string)
|
||||
|
||||
assert.Equal(t, originalSubdomain, updatedSubdomain,
|
||||
"Subdomain should not change after update")
|
||||
t.Logf("After update: subdomain=%s", updatedSubdomain)
|
||||
})
|
||||
}
|
||||
|
||||
func createDeploymentWithSubdomain(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) string {
|
||||
t.Helper()
|
||||
|
||||
var fileData []byte
|
||||
info, err := os.Stat(tarballPath)
|
||||
require.NoError(t, err)
|
||||
if info.IsDir() {
|
||||
fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
file, err := os.Open(tarballPath)
|
||||
require.NoError(t, err)
|
||||
defer file.Close()
|
||||
fileData, _ = io.ReadAll(file)
|
||||
}
|
||||
|
||||
body := &bytes.Buffer{}
|
||||
boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
|
||||
|
||||
body.WriteString("--" + boundary + "\r\n")
|
||||
body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
|
||||
body.WriteString(name + "\r\n")
|
||||
|
||||
body.WriteString("--" + boundary + "\r\n")
|
||||
body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
|
||||
body.WriteString("Content-Type: application/gzip\r\n\r\n")
|
||||
|
||||
body.Write(fileData)
|
||||
body.WriteString("\r\n--" + boundary + "--\r\n")
|
||||
|
||||
req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/upload", body)
|
||||
require.NoError(t, err)
|
||||
req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
|
||||
req.Header.Set("Authorization", "Bearer "+env.APIKey)
|
||||
|
||||
resp, err := env.HTTPClient.Do(req)
|
||||
require.NoError(t, err)
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusCreated {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
t.Fatalf("Upload failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
var result map[string]interface{}
|
||||
json.NewDecoder(resp.Body).Decode(&result)
|
||||
|
||||
if id, ok := result["deployment_id"].(string); ok {
|
||||
return id
|
||||
}
|
||||
if id, ok := result["id"].(string); ok {
|
||||
return id
|
||||
}
|
||||
t.Fatalf("No id in response: %+v", result)
|
||||
return ""
|
||||
}
|
||||
|
||||
func updateStaticDeploymentProd(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) {
|
||||
t.Helper()
|
||||
|
||||
var fileData []byte
|
||||
info, err := os.Stat(tarballPath)
|
||||
require.NoError(t, err)
|
||||
if info.IsDir() {
|
||||
fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
file, err := os.Open(tarballPath)
|
||||
require.NoError(t, err)
|
||||
defer file.Close()
|
||||
fileData, _ = io.ReadAll(file)
|
||||
}
|
||||
|
||||
body := &bytes.Buffer{}
|
||||
boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
|
||||
|
||||
body.WriteString("--" + boundary + "\r\n")
|
||||
body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
|
||||
body.WriteString(name + "\r\n")
|
||||
|
||||
body.WriteString("--" + boundary + "\r\n")
|
||||
body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
|
||||
body.WriteString("Content-Type: application/gzip\r\n\r\n")
|
||||
|
||||
body.Write(fileData)
|
||||
body.WriteString("\r\n--" + boundary + "--\r\n")
|
||||
|
||||
req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/update", body)
|
||||
require.NoError(t, err)
|
||||
req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
|
||||
req.Header.Set("Authorization", "Bearer "+env.APIKey)
|
||||
|
||||
resp, err := env.HTTPClient.Do(req)
|
||||
require.NoError(t, err)
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
t.Fatalf("Update failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
}
|
||||
@ -1,121 +0,0 @@
|
||||
//go:build e2e && production
|
||||
|
||||
package production
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestDNS_DeploymentResolution tests that deployed applications are resolvable via DNS
|
||||
// This test requires production mode as it performs real DNS lookups
|
||||
func TestDNS_DeploymentResolution(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
deploymentName := fmt.Sprintf("dns-test-%d", time.Now().Unix())
|
||||
tarballPath := filepath.Join("../../testdata/apps/react-app")
|
||||
|
||||
deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
|
||||
defer func() {
|
||||
if !env.SkipCleanup {
|
||||
e2e.DeleteDeployment(t, env, deploymentID)
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for DNS propagation
|
||||
domain := env.BuildDeploymentDomain(deploymentName)
|
||||
t.Logf("Testing DNS resolution for: %s", domain)
|
||||
|
||||
t.Run("DNS resolves to valid server IP", func(t *testing.T) {
|
||||
// Allow some time for DNS propagation
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var ips []string
|
||||
var err error
|
||||
|
||||
// Poll for DNS resolution
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
t.Fatalf("DNS resolution timeout for %s", domain)
|
||||
default:
|
||||
ips, err = net.LookupHost(domain)
|
||||
if err == nil && len(ips) > 0 {
|
||||
goto resolved
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
resolved:
|
||||
t.Logf("DNS resolved: %s -> %v", domain, ips)
|
||||
assert.NotEmpty(t, ips, "Should have IP addresses")
|
||||
|
||||
// Verify resolved IP is one of our servers
|
||||
validIPs := e2e.GetServerIPs(env.Config)
|
||||
if len(validIPs) > 0 {
|
||||
found := false
|
||||
for _, ip := range ips {
|
||||
for _, validIP := range validIPs {
|
||||
if ip == validIP {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
assert.True(t, found, "Resolved IP should be one of our servers: %v (valid: %v)", ips, validIPs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestDNS_BaseDomainResolution tests that the base domain resolves correctly
|
||||
func TestDNS_BaseDomainResolution(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
t.Run("Base domain resolves", func(t *testing.T) {
|
||||
ips, err := net.LookupHost(env.BaseDomain)
|
||||
require.NoError(t, err, "Base domain %s should resolve", env.BaseDomain)
|
||||
assert.NotEmpty(t, ips, "Should have IP addresses")
|
||||
|
||||
t.Logf("✓ Base domain %s resolves to: %v", env.BaseDomain, ips)
|
||||
})
|
||||
}
|
||||
|
||||
// TestDNS_WildcardResolution tests wildcard DNS for arbitrary subdomains
|
||||
func TestDNS_WildcardResolution(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
t.Run("Wildcard subdomain resolves", func(t *testing.T) {
|
||||
// Test with a random subdomain that doesn't exist as a deployment
|
||||
randomSubdomain := fmt.Sprintf("random-test-%d.%s", time.Now().UnixNano(), env.BaseDomain)
|
||||
|
||||
ips, err := net.LookupHost(randomSubdomain)
|
||||
if err != nil {
|
||||
// DNS may not support wildcard - that's OK for some setups
|
||||
t.Logf("⚠ Wildcard DNS not configured (this may be expected): %v", err)
|
||||
t.Skip("Wildcard DNS not configured")
|
||||
return
|
||||
}
|
||||
|
||||
assert.NotEmpty(t, ips, "Wildcard subdomain should resolve")
|
||||
t.Logf("✓ Wildcard subdomain resolves: %s -> %v", randomSubdomain, ips)
|
||||
})
|
||||
}
|
||||
@ -1,181 +0,0 @@
|
||||
//go:build e2e && production
|
||||
|
||||
package production
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/e2e"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestNameserver_NSRecords tests that NS records are properly configured for the domain
|
||||
func TestNameserver_NSRecords(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
if len(env.Config.Nameservers) == 0 {
|
||||
t.Skip("No nameservers configured in e2e/config.yaml")
|
||||
}
|
||||
|
||||
t.Run("NS records exist for base domain", func(t *testing.T) {
|
||||
nsRecords, err := net.LookupNS(env.BaseDomain)
|
||||
require.NoError(t, err, "Should be able to look up NS records for %s", env.BaseDomain)
|
||||
require.NotEmpty(t, nsRecords, "Should have NS records")
|
||||
|
||||
t.Logf("Found %d NS records for %s:", len(nsRecords), env.BaseDomain)
|
||||
for _, ns := range nsRecords {
|
||||
t.Logf(" - %s", ns.Host)
|
||||
}
|
||||
|
||||
// Verify our nameservers are listed
|
||||
for _, expected := range env.Config.Nameservers {
|
||||
found := false
|
||||
for _, ns := range nsRecords {
|
||||
// Trim trailing dot for comparison
|
||||
nsHost := strings.TrimSuffix(ns.Host, ".")
|
||||
if nsHost == expected || nsHost == expected+"." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, found, "NS records should include %s", expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestNameserver_GlueRecords tests that glue records point to correct IPs
|
||||
func TestNameserver_GlueRecords(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
if len(env.Config.Nameservers) == 0 {
|
||||
t.Skip("No nameservers configured in e2e/config.yaml")
|
||||
}
|
||||
|
||||
nameserverServers := e2e.GetNameserverServers(env.Config)
|
||||
if len(nameserverServers) == 0 {
|
||||
t.Skip("No servers marked as nameservers in config")
|
||||
}
|
||||
|
||||
t.Run("Glue records resolve to correct IPs", func(t *testing.T) {
|
||||
for i, ns := range env.Config.Nameservers {
|
||||
ips, err := net.LookupHost(ns)
|
||||
require.NoError(t, err, "Nameserver %s should resolve", ns)
|
||||
require.NotEmpty(t, ips, "Nameserver %s should have IP addresses", ns)
|
||||
|
||||
t.Logf("Nameserver %s resolves to: %v", ns, ips)
|
||||
|
||||
// If we have the expected IP, verify it matches
|
||||
if i < len(nameserverServers) {
|
||||
expectedIP := nameserverServers[i].IP
|
||||
found := false
|
||||
for _, ip := range ips {
|
||||
if ip == expectedIP {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, found, "Glue record for %s should point to %s (got %v)", ns, expectedIP, ips)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestNameserver_CoreDNSResponds tests that our CoreDNS servers respond to queries
|
||||
func TestNameserver_CoreDNSResponds(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
nameserverServers := e2e.GetNameserverServers(env.Config)
|
||||
if len(nameserverServers) == 0 {
|
||||
t.Skip("No servers marked as nameservers in config")
|
||||
}
|
||||
|
||||
t.Run("CoreDNS servers respond to queries", func(t *testing.T) {
|
||||
for _, server := range nameserverServers {
|
||||
t.Run(server.Name, func(t *testing.T) {
|
||||
// Create a custom resolver that queries this specific server
|
||||
resolver := &net.Resolver{
|
||||
PreferGo: true,
|
||||
Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
|
||||
d := net.Dialer{
|
||||
Timeout: 5 * time.Second,
|
||||
}
|
||||
return d.DialContext(ctx, "udp", server.IP+":53")
|
||||
},
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Query the base domain
|
||||
ips, err := resolver.LookupHost(ctx, env.BaseDomain)
|
||||
if err != nil {
|
||||
// Log the error but don't fail - server might be configured differently
|
||||
t.Logf("⚠ CoreDNS at %s (%s) query error: %v", server.Name, server.IP, err)
|
||||
return
|
||||
}
|
||||
|
||||
t.Logf("✓ CoreDNS at %s (%s) responded: %s -> %v", server.Name, server.IP, env.BaseDomain, ips)
|
||||
assert.NotEmpty(t, ips, "CoreDNS should return IP addresses")
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestNameserver_QueryLatency tests DNS query latency from our nameservers
|
||||
func TestNameserver_QueryLatency(t *testing.T) {
|
||||
e2e.SkipIfLocal(t)
|
||||
|
||||
env, err := e2e.LoadTestEnv()
|
||||
require.NoError(t, err, "Failed to load test environment")
|
||||
|
||||
nameserverServers := e2e.GetNameserverServers(env.Config)
|
||||
if len(nameserverServers) == 0 {
|
||||
t.Skip("No servers marked as nameservers in config")
|
||||
}
|
||||
|
||||
t.Run("DNS query latency is acceptable", func(t *testing.T) {
|
||||
for _, server := range nameserverServers {
|
||||
resolver := &net.Resolver{
|
||||
PreferGo: true,
|
||||
Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
|
||||
d := net.Dialer{
|
||||
Timeout: 5 * time.Second,
|
||||
}
|
||||
return d.DialContext(ctx, "udp", server.IP+":53")
|
||||
},
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
_, err := resolver.LookupHost(ctx, env.BaseDomain)
|
||||
latency := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
t.Logf("⚠ Query to %s failed: %v", server.Name, err)
|
||||
continue
|
||||
}
|
||||
|
||||
t.Logf("DNS latency from %s (%s): %v", server.Name, server.IP, latency)
|
||||
|
||||
// DNS queries should be fast (under 500ms is reasonable)
|
||||
assert.Less(t, latency, 500*time.Millisecond,
|
||||
"DNS query to %s should complete in under 500ms", server.Name)
|
||||
}
|
||||
})
|
||||
}
|
||||
158
pkg/cli/inspect_command.go
Normal file
158
pkg/cli/inspect_command.go
Normal file
@ -0,0 +1,158 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
// Import checks package so init() registers the checkers
|
||||
_ "github.com/DeBrosOfficial/network/pkg/inspector/checks"
|
||||
)
|
||||
|
||||
// loadDotEnv loads key=value pairs from a .env file into os environment.
|
||||
// Only sets vars that are not already set (env takes precedence over file).
|
||||
func loadDotEnv(path string) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return // .env is optional
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
eq := strings.IndexByte(line, '=')
|
||||
if eq < 1 {
|
||||
continue
|
||||
}
|
||||
key := line[:eq]
|
||||
value := line[eq+1:]
|
||||
// Only set if not already in environment
|
||||
if os.Getenv(key) == "" {
|
||||
os.Setenv(key, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// HandleInspectCommand handles the "orama inspect" command.
|
||||
func HandleInspectCommand(args []string) {
|
||||
// Load .env file from current directory (only sets unset vars)
|
||||
loadDotEnv(".env")
|
||||
|
||||
fs := flag.NewFlagSet("inspect", flag.ExitOnError)
|
||||
|
||||
configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf")
|
||||
env := fs.String("env", "", "Environment to inspect (devnet, testnet)")
|
||||
subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,all)")
|
||||
format := fs.String("format", "table", "Output format (table, json)")
|
||||
timeout := fs.Duration("timeout", 30*time.Second, "SSH command timeout")
|
||||
verbose := fs.Bool("verbose", false, "Verbose output")
|
||||
// AI flags
|
||||
aiEnabled := fs.Bool("ai", false, "Enable AI analysis of failures")
|
||||
aiModel := fs.String("model", "moonshotai/kimi-k2.5", "OpenRouter model for AI analysis")
|
||||
aiAPIKey := fs.String("api-key", "", "OpenRouter API key (or OPENROUTER_API_KEY env)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: orama inspect [flags]\n\n")
|
||||
fmt.Fprintf(os.Stderr, "Inspect cluster health by SSHing into nodes and running checks.\n\n")
|
||||
fmt.Fprintf(os.Stderr, "Flags:\n")
|
||||
fs.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExamples:\n")
|
||||
fmt.Fprintf(os.Stderr, " orama inspect --env devnet\n")
|
||||
fmt.Fprintf(os.Stderr, " orama inspect --env devnet --subsystem rqlite\n")
|
||||
fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai\n")
|
||||
fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai --model openai/gpt-4o\n")
|
||||
}
|
||||
|
||||
if err := fs.Parse(args); err != nil {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if *env == "" {
|
||||
fmt.Fprintf(os.Stderr, "Error: --env is required (devnet, testnet)\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Load nodes
|
||||
nodes, err := inspector.LoadNodes(*configPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error loading config: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Filter by environment
|
||||
nodes = inspector.FilterByEnv(nodes, *env)
|
||||
if len(nodes) == 0 {
|
||||
fmt.Fprintf(os.Stderr, "Error: no nodes found for environment %q\n", *env)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Parse subsystems
|
||||
var subsystems []string
|
||||
if *subsystem != "all" {
|
||||
subsystems = strings.Split(*subsystem, ",")
|
||||
}
|
||||
|
||||
fmt.Printf("Inspecting %d %s nodes", len(nodes), *env)
|
||||
if len(subsystems) > 0 {
|
||||
fmt.Printf(" [%s]", strings.Join(subsystems, ","))
|
||||
}
|
||||
if *aiEnabled {
|
||||
fmt.Printf(" (AI: %s)", *aiModel)
|
||||
}
|
||||
fmt.Printf("...\n\n")
|
||||
|
||||
// Phase 1: Collect
|
||||
ctx, cancel := context.WithTimeout(context.Background(), *timeout+10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if *verbose {
|
||||
fmt.Printf("Collecting data from %d nodes (timeout: %s)...\n", len(nodes), timeout)
|
||||
}
|
||||
|
||||
data := inspector.Collect(ctx, nodes, subsystems, *verbose)
|
||||
|
||||
if *verbose {
|
||||
fmt.Printf("Collection complete in %.1fs\n\n", data.Duration.Seconds())
|
||||
}
|
||||
|
||||
// Phase 2: Check
|
||||
results := inspector.RunChecks(data, subsystems)
|
||||
|
||||
// Phase 3: Report
|
||||
switch *format {
|
||||
case "json":
|
||||
inspector.PrintJSON(results, os.Stdout)
|
||||
default:
|
||||
inspector.PrintTable(results, os.Stdout)
|
||||
}
|
||||
|
||||
// Phase 4: AI Analysis (if enabled and there are failures or warnings)
|
||||
if *aiEnabled {
|
||||
issues := results.FailuresAndWarnings()
|
||||
if len(issues) == 0 {
|
||||
fmt.Printf("\nAll checks passed — no AI analysis needed.\n")
|
||||
} else {
|
||||
fmt.Printf("\nAnalyzing %d issues with %s...\n", len(issues), *aiModel)
|
||||
analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err)
|
||||
} else {
|
||||
inspector.PrintAnalysis(analysis, os.Stdout)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Exit with non-zero if any failures
|
||||
if failures := results.Failures(); len(failures) > 0 {
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
@ -53,13 +53,17 @@ func HandleStop() {
|
||||
// Reset failed state for any services that might be in failed state
|
||||
resetArgs := []string{"reset-failed"}
|
||||
resetArgs = append(resetArgs, services...)
|
||||
exec.Command("systemctl", resetArgs...).Run()
|
||||
if err := exec.Command("systemctl", resetArgs...).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Warning: Failed to reset-failed state: %v\n", err)
|
||||
}
|
||||
|
||||
// Wait again after reset-failed
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
// Stop again to ensure they're stopped
|
||||
exec.Command("systemctl", stopArgs...).Run()
|
||||
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Warning: Second stop attempt had errors: %v\n", err)
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
hadError := false
|
||||
|
||||
@ -60,10 +60,6 @@ func ParseFlags(args []string) (*Flags, error) {
|
||||
fs.IntVar(&flags.AnyoneBandwidth, "anyone-bandwidth", 30, "Limit relay to N% of VPS bandwidth (0=unlimited, runs speedtest)")
|
||||
fs.IntVar(&flags.AnyoneAccounting, "anyone-accounting", 0, "Monthly data cap for relay in GB (0=unlimited)")
|
||||
|
||||
// Support legacy flags for backwards compatibility
|
||||
nightly := fs.Bool("nightly", false, "Use nightly branch (deprecated, use --branch nightly)")
|
||||
main := fs.Bool("main", false, "Use main branch (deprecated, use --branch main)")
|
||||
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return nil, err
|
||||
@ -71,14 +67,6 @@ func ParseFlags(args []string) (*Flags, error) {
|
||||
return nil, fmt.Errorf("failed to parse flags: %w", err)
|
||||
}
|
||||
|
||||
// Handle legacy flags
|
||||
if *nightly {
|
||||
flags.Branch = "nightly"
|
||||
}
|
||||
if *main {
|
||||
flags.Branch = "main"
|
||||
}
|
||||
|
||||
// Set nameserver if explicitly provided
|
||||
if *nameserver {
|
||||
flags.Nameserver = nameserver
|
||||
|
||||
@ -10,6 +10,8 @@ import (
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/constants"
|
||||
)
|
||||
|
||||
var ErrServiceNotFound = errors.New("service not found")
|
||||
@ -22,15 +24,15 @@ type PortSpec struct {
|
||||
|
||||
var ServicePorts = map[string][]PortSpec{
|
||||
"debros-gateway": {
|
||||
{Name: "Gateway API", Port: 6001},
|
||||
{Name: "Gateway API", Port: constants.GatewayAPIPort},
|
||||
},
|
||||
"debros-olric": {
|
||||
{Name: "Olric HTTP", Port: 3320},
|
||||
{Name: "Olric Memberlist", Port: 3322},
|
||||
{Name: "Olric HTTP", Port: constants.OlricHTTPPort},
|
||||
{Name: "Olric Memberlist", Port: constants.OlricMemberlistPort},
|
||||
},
|
||||
"debros-node": {
|
||||
{Name: "RQLite HTTP", Port: 5001},
|
||||
{Name: "RQLite Raft", Port: 7001},
|
||||
{Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort},
|
||||
{Name: "RQLite Raft", Port: constants.RQLiteRaftPort},
|
||||
},
|
||||
"debros-ipfs": {
|
||||
{Name: "IPFS API", Port: 4501},
|
||||
@ -48,12 +50,12 @@ func DefaultPorts() []PortSpec {
|
||||
{Name: "IPFS Swarm", Port: 4001},
|
||||
{Name: "IPFS API", Port: 4501},
|
||||
{Name: "IPFS Gateway", Port: 8080},
|
||||
{Name: "Gateway API", Port: 6001},
|
||||
{Name: "RQLite HTTP", Port: 5001},
|
||||
{Name: "RQLite Raft", Port: 7001},
|
||||
{Name: "Gateway API", Port: constants.GatewayAPIPort},
|
||||
{Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort},
|
||||
{Name: "RQLite Raft", Port: constants.RQLiteRaftPort},
|
||||
{Name: "IPFS Cluster API", Port: 9094},
|
||||
{Name: "Olric HTTP", Port: 3320},
|
||||
{Name: "Olric Memberlist", Port: 3322},
|
||||
{Name: "Olric HTTP", Port: constants.OlricHTTPPort},
|
||||
{Name: "Olric Memberlist", Port: constants.OlricMemberlistPort},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
9
pkg/constants/capacity.go
Normal file
9
pkg/constants/capacity.go
Normal file
@ -0,0 +1,9 @@
|
||||
package constants
|
||||
|
||||
// Node capacity limits used by both deployment and namespace scheduling.
|
||||
const (
|
||||
MaxDeploymentsPerNode = 100
|
||||
MaxMemoryMB = 8192 // 8GB
|
||||
MaxCPUPercent = 400 // 400% = 4 cores
|
||||
MaxPortsPerNode = 9900 // ~10k ports available
|
||||
)
|
||||
11
pkg/constants/ports.go
Normal file
11
pkg/constants/ports.go
Normal file
@ -0,0 +1,11 @@
|
||||
package constants
|
||||
|
||||
// Service ports used across the network.
|
||||
const (
|
||||
WireGuardPort = 51820
|
||||
RQLiteHTTPPort = 5001
|
||||
RQLiteRaftPort = 7001
|
||||
OlricHTTPPort = 3320
|
||||
OlricMemberlistPort = 3322
|
||||
GatewayAPIPort = 6001
|
||||
)
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/constants"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -270,7 +271,7 @@ func (hnm *HomeNodeManager) getNodeCapacity(ctx context.Context, nodeID string)
|
||||
AllocatedPorts: allocatedPorts,
|
||||
AvailablePorts: availablePorts,
|
||||
UsedMemoryMB: totalMemoryMB,
|
||||
AvailableMemoryMB: 8192 - totalMemoryMB, // Assume 8GB per node (make configurable later)
|
||||
AvailableMemoryMB: constants.MaxMemoryMB - totalMemoryMB,
|
||||
UsedCPUPercent: totalCPUPercent,
|
||||
Score: score,
|
||||
}
|
||||
@ -331,12 +332,10 @@ func (hnm *HomeNodeManager) getNodeResourceUsage(ctx context.Context, nodeID str
|
||||
|
||||
// calculateCapacityScore calculates a 0.0-1.0 score (higher is better)
|
||||
func (hnm *HomeNodeManager) calculateCapacityScore(deploymentCount, allocatedPorts, availablePorts, usedMemoryMB, usedCPUPercent int) float64 {
|
||||
const (
|
||||
maxDeployments = 100 // Max deployments per node
|
||||
maxMemoryMB = 8192 // 8GB
|
||||
maxCPUPercent = 400 // 400% = 4 cores
|
||||
maxPorts = 9900 // ~10k ports available
|
||||
)
|
||||
maxDeployments := constants.MaxDeploymentsPerNode
|
||||
maxMemoryMB := constants.MaxMemoryMB
|
||||
maxCPUPercent := constants.MaxCPUPercent
|
||||
maxPorts := constants.MaxPortsPerNode
|
||||
|
||||
// Calculate individual component scores (0.0 to 1.0)
|
||||
deploymentScore := 1.0 - (float64(deploymentCount) / float64(maxDeployments))
|
||||
|
||||
@ -3,6 +3,7 @@ package deployments
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
@ -216,21 +217,6 @@ func isConflictError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
// RQLite returns constraint violation errors as strings containing "UNIQUE constraint failed"
|
||||
errStr := err.Error()
|
||||
return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
|
||||
}
|
||||
|
||||
// contains checks if a string contains a substring (case-insensitive)
|
||||
func contains(s, substr string) bool {
|
||||
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
|
||||
}
|
||||
|
||||
func findSubstring(s, substr string) bool {
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if s[i:i+len(substr)] == substr {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict")
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
@ -410,7 +411,7 @@ func TestContains(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := contains(tt.s, tt.substr)
|
||||
result := strings.Contains(tt.s, tt.substr)
|
||||
if result != tt.expected {
|
||||
t.Errorf("contains(%q, %q) = %v, expected %v", tt.s, tt.substr, result, tt.expected)
|
||||
}
|
||||
|
||||
@ -249,9 +249,7 @@ var (
|
||||
ErrNoNodesAvailable = &DeploymentError{Message: "no nodes available for deployment"}
|
||||
ErrDeploymentNotFound = &DeploymentError{Message: "deployment not found"}
|
||||
ErrNamespaceNotAssigned = &DeploymentError{Message: "namespace has no home node assigned"}
|
||||
ErrInvalidDeploymentType = &DeploymentError{Message: "invalid deployment type"}
|
||||
ErrSubdomainTaken = &DeploymentError{Message: "subdomain already in use"}
|
||||
ErrDomainReserved = &DeploymentError{Message: "domain is reserved"}
|
||||
)
|
||||
|
||||
// DeploymentError represents a deployment-related error
|
||||
|
||||
@ -429,7 +429,9 @@ func (sg *SecretGenerator) SaveConfig(filename string, content string) error {
|
||||
}
|
||||
|
||||
// Fix ownership
|
||||
exec.Command("chown", "debros:debros", configPath).Run()
|
||||
if err := exec.Command("chown", "debros:debros", configPath).Run(); err != nil {
|
||||
fmt.Printf("Warning: failed to chown %s to debros:debros: %v\n", configPath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -3,7 +3,6 @@ package production
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
@ -15,10 +14,7 @@ type NodePreferences struct {
|
||||
AnyoneClient bool `yaml:"anyone_client"`
|
||||
}
|
||||
|
||||
const (
|
||||
preferencesFile = "preferences.yaml"
|
||||
legacyBranchFile = ".branch"
|
||||
)
|
||||
const preferencesFile = "preferences.yaml"
|
||||
|
||||
// SavePreferences saves node preferences to disk
|
||||
func SavePreferences(oramaDir string, prefs *NodePreferences) error {
|
||||
@ -38,10 +34,6 @@ func SavePreferences(oramaDir string, prefs *NodePreferences) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Also save branch to legacy .branch file for backward compatibility
|
||||
legacyPath := filepath.Join(oramaDir, legacyBranchFile)
|
||||
os.WriteFile(legacyPath, []byte(prefs.Branch), 0644)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -53,7 +45,7 @@ func LoadPreferences(oramaDir string) *NodePreferences {
|
||||
Nameserver: false,
|
||||
}
|
||||
|
||||
// Try to load from preferences.yaml first
|
||||
// Try to load from preferences.yaml
|
||||
path := filepath.Join(oramaDir, preferencesFile)
|
||||
if data, err := os.ReadFile(path); err == nil {
|
||||
if err := yaml.Unmarshal(data, prefs); err == nil {
|
||||
@ -61,15 +53,6 @@ func LoadPreferences(oramaDir string) *NodePreferences {
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to legacy .branch file
|
||||
legacyPath := filepath.Join(oramaDir, legacyBranchFile)
|
||||
if data, err := os.ReadFile(legacyPath); err == nil {
|
||||
branch := strings.TrimSpace(string(data))
|
||||
if branch != "" {
|
||||
prefs.Branch = branch
|
||||
}
|
||||
}
|
||||
|
||||
return prefs
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
@ -234,31 +235,15 @@ func isPrivateOrLocalHost(host string) bool {
|
||||
}
|
||||
|
||||
// Check for localhost variants
|
||||
if host == "localhost" || host == "::1" {
|
||||
if host == "localhost" {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check common private ranges (basic check)
|
||||
if strings.HasPrefix(host, "10.") ||
|
||||
strings.HasPrefix(host, "192.168.") ||
|
||||
strings.HasPrefix(host, "172.16.") ||
|
||||
strings.HasPrefix(host, "172.17.") ||
|
||||
strings.HasPrefix(host, "172.18.") ||
|
||||
strings.HasPrefix(host, "172.19.") ||
|
||||
strings.HasPrefix(host, "172.20.") ||
|
||||
strings.HasPrefix(host, "172.21.") ||
|
||||
strings.HasPrefix(host, "172.22.") ||
|
||||
strings.HasPrefix(host, "172.23.") ||
|
||||
strings.HasPrefix(host, "172.24.") ||
|
||||
strings.HasPrefix(host, "172.25.") ||
|
||||
strings.HasPrefix(host, "172.26.") ||
|
||||
strings.HasPrefix(host, "172.27.") ||
|
||||
strings.HasPrefix(host, "172.28.") ||
|
||||
strings.HasPrefix(host, "172.29.") ||
|
||||
strings.HasPrefix(host, "172.30.") ||
|
||||
strings.HasPrefix(host, "172.31.") {
|
||||
return true
|
||||
// Parse as IP and use standard library checks
|
||||
ip := net.ParseIP(host)
|
||||
if ip == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return false
|
||||
return ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast()
|
||||
}
|
||||
|
||||
@ -23,9 +23,8 @@ import (
|
||||
type HTTPGateway struct {
|
||||
logger *logging.ColoredLogger
|
||||
config *config.HTTPGatewayConfig
|
||||
router chi.Router
|
||||
reverseProxies map[string]*httputil.ReverseProxy
|
||||
mu sync.RWMutex
|
||||
router chi.Router
|
||||
mu sync.RWMutex
|
||||
server *http.Server
|
||||
}
|
||||
|
||||
@ -46,8 +45,7 @@ func NewHTTPGateway(logger *logging.ColoredLogger, cfg *config.HTTPGatewayConfig
|
||||
gateway := &HTTPGateway{
|
||||
logger: logger,
|
||||
config: cfg,
|
||||
router: chi.NewRouter(),
|
||||
reverseProxies: make(map[string]*httputil.ReverseProxy),
|
||||
router: chi.NewRouter(),
|
||||
}
|
||||
|
||||
// Set up router middleware
|
||||
@ -110,8 +108,6 @@ func (hg *HTTPGateway) initializeRoutes() error {
|
||||
}
|
||||
}
|
||||
|
||||
hg.reverseProxies[routeName] = proxy
|
||||
|
||||
// Register route handler
|
||||
hg.registerRouteHandler(routeName, routeConfig, proxy)
|
||||
|
||||
|
||||
@ -1111,35 +1111,6 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
|
||||
}
|
||||
}
|
||||
|
||||
// Legacy format: {name}.node-{shortID}.{baseDomain} (backwards compatibility)
|
||||
if len(parts) == 2 && strings.HasPrefix(parts[1], "node-") {
|
||||
deploymentName := parts[0]
|
||||
shortNodeID := parts[1] // e.g., "node-kv4la8"
|
||||
|
||||
// Query by name and matching short node ID
|
||||
query := `
|
||||
SELECT id, namespace, name, type, port, content_cid, status, home_node_id
|
||||
FROM deployments
|
||||
WHERE name = ?
|
||||
AND ('node-' || substr(home_node_id, 9, 6) = ? OR home_node_id = ?)
|
||||
AND status = 'active'
|
||||
LIMIT 1
|
||||
`
|
||||
result, err := db.Query(internalCtx, query, deploymentName, shortNodeID, shortNodeID)
|
||||
if err == nil && len(result.Rows) > 0 {
|
||||
row := result.Rows[0]
|
||||
return &deployments.Deployment{
|
||||
ID: getString(row[0]),
|
||||
Namespace: getString(row[1]),
|
||||
Name: getString(row[2]),
|
||||
Type: deployments.DeploymentType(getString(row[3])),
|
||||
Port: getInt(row[4]),
|
||||
ContentCID: getString(row[5]),
|
||||
Status: deployments.DeploymentStatus(getString(row[6])),
|
||||
HomeNodeID: getString(row[7]),
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try custom domain from deployment_domains table
|
||||
|
||||
@ -9,6 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/wireguard"
|
||||
"github.com/libp2p/go-libp2p/core/host"
|
||||
"github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/multiformats/go-multiaddr"
|
||||
@ -337,16 +338,21 @@ func (pd *PeerDiscovery) updateHeartbeat(ctx context.Context) error {
|
||||
}
|
||||
|
||||
// GetWireGuardIP detects the local WireGuard IP address using the wg0 network
|
||||
// interface or the WireGuard config file. It does not require a PeerDiscovery
|
||||
// instance and can be called from anywhere in the gateway package.
|
||||
// interface, the 'ip' command, or the WireGuard config file.
|
||||
// It does not require a PeerDiscovery instance and can be called from anywhere
|
||||
// in the gateway package.
|
||||
func GetWireGuardIP() (string, error) {
|
||||
// Method 1: Use 'ip addr show wg0' command (works without root)
|
||||
ip, err := getWireGuardIPFromCommand()
|
||||
if err == nil {
|
||||
// Method 1: Use net.InterfaceByName (shared implementation)
|
||||
if ip, err := wireguard.GetIP(); err == nil {
|
||||
return ip, nil
|
||||
}
|
||||
|
||||
// Method 2: Try to read from WireGuard config file (requires root, may fail)
|
||||
// Method 2: Use 'ip addr show wg0' command (works without root)
|
||||
if ip, err := getWireGuardIPFromCommand(); err == nil {
|
||||
return ip, nil
|
||||
}
|
||||
|
||||
// Method 3: Try to read from WireGuard config file (requires root, may fail)
|
||||
configPath := "/etc/wireguard/wg0.conf"
|
||||
data, err := os.ReadFile(configPath)
|
||||
if err == nil {
|
||||
@ -359,7 +365,6 @@ func GetWireGuardIP() (string, error) {
|
||||
parts := strings.Split(line, "=")
|
||||
if len(parts) == 2 {
|
||||
addrWithCIDR := strings.TrimSpace(parts[1])
|
||||
// Remove /24 suffix
|
||||
ip := strings.Split(addrWithCIDR, "/")[0]
|
||||
ip = strings.TrimSpace(ip)
|
||||
return ip, nil
|
||||
|
||||
229
pkg/inspector/analyzer.go
Normal file
229
pkg/inspector/analyzer.go
Normal file
@ -0,0 +1,229 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// AnalysisResult holds the AI's analysis of check failures.
|
||||
type AnalysisResult struct {
|
||||
Model string
|
||||
Analysis string
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
// Analyze sends failures and cluster context to OpenRouter for AI analysis.
|
||||
func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) {
|
||||
if apiKey == "" {
|
||||
apiKey = os.Getenv("OPENROUTER_API_KEY")
|
||||
}
|
||||
if apiKey == "" {
|
||||
return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env")
|
||||
}
|
||||
|
||||
// Build the prompt with failures, warnings, and cluster context
|
||||
prompt := buildAnalysisPrompt(results, data)
|
||||
|
||||
start := time.Now()
|
||||
response, err := callOpenRouter(model, apiKey, prompt)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("OpenRouter API call failed: %w", err)
|
||||
}
|
||||
|
||||
return &AnalysisResult{
|
||||
Model: model,
|
||||
Analysis: response,
|
||||
Duration: time.Since(start),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func buildAnalysisPrompt(results *Results, data *ClusterData) string {
|
||||
var b strings.Builder
|
||||
|
||||
// System context
|
||||
b.WriteString("You are a distributed systems expert analyzing health check results for an Orama Network cluster.\n")
|
||||
b.WriteString("The cluster runs RQLite (Raft consensus), Olric (distributed cache), IPFS, CoreDNS, and WireGuard.\n\n")
|
||||
|
||||
// Cluster overview
|
||||
b.WriteString("## Cluster Overview\n")
|
||||
b.WriteString(fmt.Sprintf("Nodes inspected: %d\n", len(data.Nodes)))
|
||||
for host, nd := range data.Nodes {
|
||||
b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Summary
|
||||
passed, failed, warned, skipped := results.Summary()
|
||||
b.WriteString(fmt.Sprintf("## Check Results: %d passed, %d failed, %d warnings, %d skipped\n\n", passed, failed, warned, skipped))
|
||||
|
||||
// List all failures
|
||||
failures := results.Failures()
|
||||
if len(failures) > 0 {
|
||||
b.WriteString("## Failures (CRITICAL)\n")
|
||||
for _, f := range failures {
|
||||
node := f.Node
|
||||
if node == "" {
|
||||
node = "cluster-wide"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", f.Severity, f.Name, node, f.Message))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// List all warnings
|
||||
warnings := results.FailuresAndWarnings()
|
||||
warningsOnly := make([]CheckResult, 0)
|
||||
for _, w := range warnings {
|
||||
if w.Status == StatusWarn {
|
||||
warningsOnly = append(warningsOnly, w)
|
||||
}
|
||||
}
|
||||
if len(warningsOnly) > 0 {
|
||||
b.WriteString("## Warnings\n")
|
||||
for _, w := range warningsOnly {
|
||||
node := w.Node
|
||||
if node == "" {
|
||||
node = "cluster-wide"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", w.Severity, w.Name, node, w.Message))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Add raw RQLite status for context (condensed)
|
||||
b.WriteString("## Raw Cluster Data (condensed)\n")
|
||||
for host, nd := range data.Nodes {
|
||||
if nd.RQLite != nil && nd.RQLite.Status != nil {
|
||||
s := nd.RQLite.Status
|
||||
b.WriteString(fmt.Sprintf("### %s (RQLite)\n", host))
|
||||
b.WriteString(fmt.Sprintf(" raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n",
|
||||
s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter))
|
||||
if nd.RQLite.Nodes != nil {
|
||||
b.WriteString(fmt.Sprintf(" /nodes reports %d members:", len(nd.RQLite.Nodes)))
|
||||
for addr, n := range nd.RQLite.Nodes {
|
||||
reachable := "ok"
|
||||
if !n.Reachable {
|
||||
reachable = "UNREACHABLE"
|
||||
}
|
||||
leader := ""
|
||||
if n.Leader {
|
||||
leader = " LEADER"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf(" %s(%s%s)", addr, reachable, leader))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
b.WriteString("\n## Task\n")
|
||||
b.WriteString("Analyze the failures and warnings above. For each issue:\n")
|
||||
b.WriteString("1. Explain the root cause\n")
|
||||
b.WriteString("2. Assess the severity and impact on the cluster\n")
|
||||
b.WriteString("3. Suggest specific commands or actions to fix it\n")
|
||||
b.WriteString("\nBe concise and actionable. Group related issues together. Use markdown formatting.\n")
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// OpenRouter API types (OpenAI-compatible)
|
||||
|
||||
type openRouterRequest struct {
|
||||
Model string `json:"model"`
|
||||
Messages []openRouterMessage `json:"messages"`
|
||||
}
|
||||
|
||||
type openRouterMessage struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
type openRouterResponse struct {
|
||||
Choices []struct {
|
||||
Message struct {
|
||||
Content string `json:"content"`
|
||||
} `json:"message"`
|
||||
} `json:"choices"`
|
||||
Error *struct {
|
||||
Message string `json:"message"`
|
||||
Code int `json:"code"`
|
||||
} `json:"error"`
|
||||
}
|
||||
|
||||
func callOpenRouter(model, apiKey, prompt string) (string, error) {
|
||||
reqBody := openRouterRequest{
|
||||
Model: model,
|
||||
Messages: []openRouterMessage{
|
||||
{Role: "user", Content: prompt},
|
||||
},
|
||||
}
|
||||
|
||||
jsonBody, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", "https://openrouter.ai/api/v1/chat/completions", bytes.NewReader(jsonBody))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+apiKey)
|
||||
|
||||
client := &http.Client{Timeout: 120 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("HTTP request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("API returned %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var orResp openRouterResponse
|
||||
if err := json.Unmarshal(body, &orResp); err != nil {
|
||||
return "", fmt.Errorf("unmarshal response: %w", err)
|
||||
}
|
||||
|
||||
if orResp.Error != nil {
|
||||
return "", fmt.Errorf("API error: %s", orResp.Error.Message)
|
||||
}
|
||||
|
||||
if len(orResp.Choices) == 0 {
|
||||
return "", fmt.Errorf("no choices in response (raw: %s)", truncate(string(body), 500))
|
||||
}
|
||||
|
||||
content := orResp.Choices[0].Message.Content
|
||||
if strings.TrimSpace(content) == "" {
|
||||
return "", fmt.Errorf("model returned empty response (raw: %s)", truncate(string(body), 500))
|
||||
}
|
||||
|
||||
return content, nil
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// PrintAnalysis writes the AI analysis to the output.
|
||||
func PrintAnalysis(analysis *AnalysisResult, w io.Writer) {
|
||||
fmt.Fprintf(w, "\n## AI Analysis (%s)\n", analysis.Model)
|
||||
fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))
|
||||
fmt.Fprintf(w, "%s\n", analysis.Analysis)
|
||||
fmt.Fprintf(w, "\n(Analysis took %.1fs)\n", analysis.Duration.Seconds())
|
||||
}
|
||||
172
pkg/inspector/checker.go
Normal file
172
pkg/inspector/checker.go
Normal file
@ -0,0 +1,172 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Severity levels for check results.
|
||||
type Severity int
|
||||
|
||||
const (
|
||||
Low Severity = iota
|
||||
Medium
|
||||
High
|
||||
Critical
|
||||
)
|
||||
|
||||
func (s Severity) String() string {
|
||||
switch s {
|
||||
case Low:
|
||||
return "LOW"
|
||||
case Medium:
|
||||
return "MEDIUM"
|
||||
case High:
|
||||
return "HIGH"
|
||||
case Critical:
|
||||
return "CRITICAL"
|
||||
default:
|
||||
return "UNKNOWN"
|
||||
}
|
||||
}
|
||||
|
||||
// Status represents the outcome of a check.
type Status string

const (
	StatusPass Status = "pass" // check succeeded
	StatusFail Status = "fail" // check found a problem
	StatusWarn Status = "warn" // check found something suspicious but non-fatal
	StatusSkip Status = "skip" // check did not run or produced no verdict
)

// CheckResult holds the outcome of a single health check.
type CheckResult struct {
	ID        string   `json:"id"`        // e.g. "rqlite.leader_exists"
	Name      string   `json:"name"`      // "Cluster has exactly one leader"
	Subsystem string   `json:"subsystem"` // "rqlite"
	Severity  Severity `json:"severity"`  // how serious a failure of this check is
	Status    Status   `json:"status"`
	Message   string   `json:"message"`        // human-readable detail
	Node      string   `json:"node,omitempty"` // which node (empty for cluster-wide)
}

// Results holds all check outcomes.
type Results struct {
	Checks   []CheckResult `json:"checks"`
	Duration time.Duration `json:"duration"` // wall-clock time spent running the checks
}
|
||||
|
||||
// Summary returns counts by status.
|
||||
func (r *Results) Summary() (passed, failed, warned, skipped int) {
|
||||
for _, c := range r.Checks {
|
||||
switch c.Status {
|
||||
case StatusPass:
|
||||
passed++
|
||||
case StatusFail:
|
||||
failed++
|
||||
case StatusWarn:
|
||||
warned++
|
||||
case StatusSkip:
|
||||
skipped++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Failures returns only failed checks.
|
||||
func (r *Results) Failures() []CheckResult {
|
||||
var out []CheckResult
|
||||
for _, c := range r.Checks {
|
||||
if c.Status == StatusFail {
|
||||
out = append(out, c)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// FailuresAndWarnings returns failed and warning checks.
|
||||
func (r *Results) FailuresAndWarnings() []CheckResult {
|
||||
var out []CheckResult
|
||||
for _, c := range r.Checks {
|
||||
if c.Status == StatusFail || c.Status == StatusWarn {
|
||||
out = append(out, c)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// CheckFunc is the signature for a subsystem check function. It receives the
// collected cluster data and returns the check results for its subsystem.
type CheckFunc func(data *ClusterData) []CheckResult

// SubsystemCheckers maps subsystem names to their check functions.
// Populated by checks/ package init or by explicit registration.
// NOTE(review): plain map with no synchronization — registration looks
// intended for package-init time only; confirm nothing registers concurrently.
var SubsystemCheckers = map[string]CheckFunc{}

// RegisterChecker registers a check function for a subsystem, replacing any
// function previously registered under the same name.
func RegisterChecker(subsystem string, fn CheckFunc) {
	SubsystemCheckers[subsystem] = fn
}
|
||||
|
||||
// RunChecks executes checks for the requested subsystems against collected data.
|
||||
func RunChecks(data *ClusterData, subsystems []string) *Results {
|
||||
start := time.Now()
|
||||
results := &Results{}
|
||||
|
||||
shouldCheck := func(name string) bool {
|
||||
if len(subsystems) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, s := range subsystems {
|
||||
if s == name || s == "all" {
|
||||
return true
|
||||
}
|
||||
// Alias: "wg" matches "wireguard"
|
||||
if s == "wg" && name == "wireguard" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
for name, fn := range SubsystemCheckers {
|
||||
if shouldCheck(name) {
|
||||
checks := fn(data)
|
||||
results.Checks = append(results.Checks, checks...)
|
||||
}
|
||||
}
|
||||
|
||||
results.Duration = time.Since(start)
|
||||
return results
|
||||
}
|
||||
|
||||
// Pass creates a passing check result.
|
||||
func Pass(id, name, subsystem, node, msg string, sev Severity) CheckResult {
|
||||
return CheckResult{
|
||||
ID: id, Name: name, Subsystem: subsystem,
|
||||
Severity: sev, Status: StatusPass, Message: msg, Node: node,
|
||||
}
|
||||
}
|
||||
|
||||
// Fail creates a failing check result.
|
||||
func Fail(id, name, subsystem, node, msg string, sev Severity) CheckResult {
|
||||
return CheckResult{
|
||||
ID: id, Name: name, Subsystem: subsystem,
|
||||
Severity: sev, Status: StatusFail, Message: msg, Node: node,
|
||||
}
|
||||
}
|
||||
|
||||
// Warn creates a warning check result.
|
||||
func Warn(id, name, subsystem, node, msg string, sev Severity) CheckResult {
|
||||
return CheckResult{
|
||||
ID: id, Name: name, Subsystem: subsystem,
|
||||
Severity: sev, Status: StatusWarn, Message: msg, Node: node,
|
||||
}
|
||||
}
|
||||
|
||||
// Skip creates a skipped check result.
|
||||
func Skip(id, name, subsystem, node, msg string, sev Severity) CheckResult {
|
||||
return CheckResult{
|
||||
ID: id, Name: name, Subsystem: subsystem,
|
||||
Severity: sev, Status: StatusSkip, Message: msg, Node: node,
|
||||
}
|
||||
}
|
||||
190
pkg/inspector/checker_test.go
Normal file
190
pkg/inspector/checker_test.go
Normal file
@ -0,0 +1,190 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestSummary(t *testing.T) {
|
||||
r := &Results{
|
||||
Checks: []CheckResult{
|
||||
{ID: "a", Status: StatusPass},
|
||||
{ID: "b", Status: StatusPass},
|
||||
{ID: "c", Status: StatusFail},
|
||||
{ID: "d", Status: StatusWarn},
|
||||
{ID: "e", Status: StatusSkip},
|
||||
{ID: "f", Status: StatusPass},
|
||||
},
|
||||
}
|
||||
passed, failed, warned, skipped := r.Summary()
|
||||
if passed != 3 {
|
||||
t.Errorf("passed: want 3, got %d", passed)
|
||||
}
|
||||
if failed != 1 {
|
||||
t.Errorf("failed: want 1, got %d", failed)
|
||||
}
|
||||
if warned != 1 {
|
||||
t.Errorf("warned: want 1, got %d", warned)
|
||||
}
|
||||
if skipped != 1 {
|
||||
t.Errorf("skipped: want 1, got %d", skipped)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFailures(t *testing.T) {
|
||||
r := &Results{
|
||||
Checks: []CheckResult{
|
||||
{ID: "a", Status: StatusPass},
|
||||
{ID: "b", Status: StatusFail},
|
||||
{ID: "c", Status: StatusWarn},
|
||||
{ID: "d", Status: StatusFail},
|
||||
},
|
||||
}
|
||||
failures := r.Failures()
|
||||
if len(failures) != 2 {
|
||||
t.Fatalf("want 2 failures, got %d", len(failures))
|
||||
}
|
||||
for _, f := range failures {
|
||||
if f.Status != StatusFail {
|
||||
t.Errorf("expected StatusFail, got %s for check %s", f.Status, f.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFailuresAndWarnings(t *testing.T) {
|
||||
r := &Results{
|
||||
Checks: []CheckResult{
|
||||
{ID: "a", Status: StatusPass},
|
||||
{ID: "b", Status: StatusFail},
|
||||
{ID: "c", Status: StatusWarn},
|
||||
{ID: "d", Status: StatusSkip},
|
||||
},
|
||||
}
|
||||
fw := r.FailuresAndWarnings()
|
||||
if len(fw) != 2 {
|
||||
t.Fatalf("want 2 failures+warnings, got %d", len(fw))
|
||||
}
|
||||
}
|
||||
|
||||
func TestPass(t *testing.T) {
|
||||
c := Pass("test.id", "Test Name", "sub", "node1", "msg", Critical)
|
||||
if c.Status != StatusPass {
|
||||
t.Errorf("want pass, got %s", c.Status)
|
||||
}
|
||||
if c.Severity != Critical {
|
||||
t.Errorf("want Critical, got %s", c.Severity)
|
||||
}
|
||||
if c.Node != "node1" {
|
||||
t.Errorf("want node1, got %s", c.Node)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFail(t *testing.T) {
|
||||
c := Fail("test.id", "Test Name", "sub", "", "msg", High)
|
||||
if c.Status != StatusFail {
|
||||
t.Errorf("want fail, got %s", c.Status)
|
||||
}
|
||||
if c.Node != "" {
|
||||
t.Errorf("want empty node, got %q", c.Node)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWarn(t *testing.T) {
|
||||
c := Warn("test.id", "Test Name", "sub", "n", "msg", Medium)
|
||||
if c.Status != StatusWarn {
|
||||
t.Errorf("want warn, got %s", c.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSkip(t *testing.T) {
|
||||
c := Skip("test.id", "Test Name", "sub", "n", "msg", Low)
|
||||
if c.Status != StatusSkip {
|
||||
t.Errorf("want skip, got %s", c.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSeverityString(t *testing.T) {
|
||||
tests := []struct {
|
||||
sev Severity
|
||||
want string
|
||||
}{
|
||||
{Low, "LOW"},
|
||||
{Medium, "MEDIUM"},
|
||||
{High, "HIGH"},
|
||||
{Critical, "CRITICAL"},
|
||||
{Severity(99), "UNKNOWN"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.want, func(t *testing.T) {
|
||||
if got := tt.sev.String(); got != tt.want {
|
||||
t.Errorf("Severity(%d).String() = %q, want %q", tt.sev, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunChecks_EmptyData(t *testing.T) {
|
||||
data := &ClusterData{
|
||||
Nodes: map[string]*NodeData{},
|
||||
Duration: time.Second,
|
||||
}
|
||||
results := RunChecks(data, nil)
|
||||
if results == nil {
|
||||
t.Fatal("RunChecks returned nil")
|
||||
}
|
||||
// Should not panic and should return a valid Results
|
||||
}
|
||||
|
||||
func TestRunChecks_FilterBySubsystem(t *testing.T) {
|
||||
// Register a test checker
|
||||
called := map[string]bool{}
|
||||
SubsystemCheckers["test_sub_a"] = func(data *ClusterData) []CheckResult {
|
||||
called["a"] = true
|
||||
return []CheckResult{Pass("a.1", "A1", "test_sub_a", "", "ok", Low)}
|
||||
}
|
||||
SubsystemCheckers["test_sub_b"] = func(data *ClusterData) []CheckResult {
|
||||
called["b"] = true
|
||||
return []CheckResult{Pass("b.1", "B1", "test_sub_b", "", "ok", Low)}
|
||||
}
|
||||
defer delete(SubsystemCheckers, "test_sub_a")
|
||||
defer delete(SubsystemCheckers, "test_sub_b")
|
||||
|
||||
data := &ClusterData{Nodes: map[string]*NodeData{}}
|
||||
|
||||
// Filter to only "test_sub_a"
|
||||
results := RunChecks(data, []string{"test_sub_a"})
|
||||
if !called["a"] {
|
||||
t.Error("test_sub_a checker was not called")
|
||||
}
|
||||
if called["b"] {
|
||||
t.Error("test_sub_b checker should not have been called")
|
||||
}
|
||||
|
||||
found := false
|
||||
for _, c := range results.Checks {
|
||||
if c.ID == "a.1" {
|
||||
found = true
|
||||
}
|
||||
if c.Subsystem == "test_sub_b" {
|
||||
t.Error("should not have checks from test_sub_b")
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Error("expected check a.1 in results")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunChecks_AliasWG(t *testing.T) {
|
||||
called := false
|
||||
SubsystemCheckers["wireguard"] = func(data *ClusterData) []CheckResult {
|
||||
called = true
|
||||
return nil
|
||||
}
|
||||
defer delete(SubsystemCheckers, "wireguard")
|
||||
|
||||
data := &ClusterData{Nodes: map[string]*NodeData{}}
|
||||
RunChecks(data, []string{"wg"})
|
||||
if !called {
|
||||
t.Error("wireguard checker not called via 'wg' alias")
|
||||
}
|
||||
}
|
||||
224
pkg/inspector/checks/dns.go
Normal file
224
pkg/inspector/checks/dns.go
Normal file
@ -0,0 +1,224 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// Register the DNS checker with the inspector framework at package load time.
func init() {
	inspector.RegisterChecker("dns", CheckDNS)
}

// dnsSub is the subsystem label attached to every DNS check result.
const dnsSub = "dns"
|
||||
|
||||
// CheckDNS runs all DNS/CoreDNS health checks against cluster data.
|
||||
func CheckDNS(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.DNS == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkDNSPerNode(nd)...)
|
||||
}
|
||||
|
||||
results = append(results, checkDNSCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func checkDNSPerNode(nd *inspector.NodeData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
dns := nd.DNS
|
||||
node := nd.Node.Name()
|
||||
|
||||
// 4.1 CoreDNS service running
|
||||
if dns.CoreDNSActive {
|
||||
r = append(r, inspector.Pass("dns.coredns_active", "CoreDNS service active", dnsSub, node,
|
||||
"coredns is active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.coredns_active", "CoreDNS service active", dnsSub, node,
|
||||
"coredns is not active", inspector.Critical))
|
||||
return r
|
||||
}
|
||||
|
||||
// 4.47 Caddy service running
|
||||
if dns.CaddyActive {
|
||||
r = append(r, inspector.Pass("dns.caddy_active", "Caddy service active", dnsSub, node,
|
||||
"caddy is active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.caddy_active", "Caddy service active", dnsSub, node,
|
||||
"caddy is not active", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.8 DNS port 53 bound
|
||||
if dns.Port53Bound {
|
||||
r = append(r, inspector.Pass("dns.port_53", "DNS port 53 bound", dnsSub, node,
|
||||
"UDP 53 is listening", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.port_53", "DNS port 53 bound", dnsSub, node,
|
||||
"UDP 53 is NOT listening", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.10 HTTP port 80
|
||||
if dns.Port80Bound {
|
||||
r = append(r, inspector.Pass("dns.port_80", "HTTP port 80 bound", dnsSub, node,
|
||||
"TCP 80 is listening", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("dns.port_80", "HTTP port 80 bound", dnsSub, node,
|
||||
"TCP 80 is NOT listening", inspector.High))
|
||||
}
|
||||
|
||||
// 4.11 HTTPS port 443
|
||||
if dns.Port443Bound {
|
||||
r = append(r, inspector.Pass("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
|
||||
"TCP 443 is listening", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
|
||||
"TCP 443 is NOT listening", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.3 CoreDNS memory
|
||||
if dns.CoreDNSMemMB > 0 {
|
||||
if dns.CoreDNSMemMB < 100 {
|
||||
r = append(r, inspector.Pass("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
|
||||
fmt.Sprintf("RSS=%dMB", dns.CoreDNSMemMB), inspector.Medium))
|
||||
} else if dns.CoreDNSMemMB < 200 {
|
||||
r = append(r, inspector.Warn("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
|
||||
fmt.Sprintf("RSS=%dMB (elevated)", dns.CoreDNSMemMB), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
|
||||
fmt.Sprintf("RSS=%dMB (high)", dns.CoreDNSMemMB), inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
// 4.4 CoreDNS restart count
|
||||
if dns.CoreDNSRestarts == 0 {
|
||||
r = append(r, inspector.Pass("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
|
||||
"NRestarts=0", inspector.High))
|
||||
} else if dns.CoreDNSRestarts <= 3 {
|
||||
r = append(r, inspector.Warn("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
|
||||
fmt.Sprintf("NRestarts=%d", dns.CoreDNSRestarts), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
|
||||
fmt.Sprintf("NRestarts=%d (crash-looping?)", dns.CoreDNSRestarts), inspector.High))
|
||||
}
|
||||
|
||||
// 4.7 CoreDNS log error rate
|
||||
if dns.LogErrors == 0 {
|
||||
r = append(r, inspector.Pass("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
|
||||
"0 errors in last 5 minutes", inspector.High))
|
||||
} else if dns.LogErrors < 5 {
|
||||
r = append(r, inspector.Warn("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
|
||||
fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
|
||||
fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
|
||||
}
|
||||
|
||||
// 4.14 Corefile exists
|
||||
if dns.CorefileExists {
|
||||
r = append(r, inspector.Pass("dns.corefile_exists", "Corefile exists", dnsSub, node,
|
||||
"/etc/coredns/Corefile present", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.corefile_exists", "Corefile exists", dnsSub, node,
|
||||
"/etc/coredns/Corefile NOT found", inspector.High))
|
||||
}
|
||||
|
||||
// 4.20 SOA resolution
|
||||
if dns.SOAResolves {
|
||||
r = append(r, inspector.Pass("dns.soa_resolves", "SOA record resolves", dnsSub, node,
|
||||
"dig SOA returned result", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.soa_resolves", "SOA record resolves", dnsSub, node,
|
||||
"dig SOA returned no result", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.21 NS records resolve
|
||||
if dns.NSResolves {
|
||||
r = append(r, inspector.Pass("dns.ns_resolves", "NS records resolve", dnsSub, node,
|
||||
fmt.Sprintf("%d NS records returned", dns.NSRecordCount), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.ns_resolves", "NS records resolve", dnsSub, node,
|
||||
"dig NS returned no results", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.23 Wildcard DNS resolution
|
||||
if dns.WildcardResolves {
|
||||
r = append(r, inspector.Pass("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
|
||||
"test-wildcard.<domain> returned IP", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
|
||||
"test-wildcard.<domain> returned no IP", inspector.Critical))
|
||||
}
|
||||
|
||||
// 4.24 Base domain A record
|
||||
if dns.BaseAResolves {
|
||||
r = append(r, inspector.Pass("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
|
||||
"<domain> A record returned IP", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
|
||||
"<domain> A record returned no IP", inspector.High))
|
||||
}
|
||||
|
||||
// 4.50 TLS certificate - base domain
|
||||
if dns.BaseTLSDaysLeft >= 0 {
|
||||
if dns.BaseTLSDaysLeft > 30 {
|
||||
r = append(r, inspector.Pass("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry", dns.BaseTLSDaysLeft), inspector.Critical))
|
||||
} else if dns.BaseTLSDaysLeft > 7 {
|
||||
r = append(r, inspector.Warn("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry (expiring soon)", dns.BaseTLSDaysLeft), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry (CRITICAL)", dns.BaseTLSDaysLeft), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 4.51 TLS certificate - wildcard
|
||||
if dns.WildTLSDaysLeft >= 0 {
|
||||
if dns.WildTLSDaysLeft > 30 {
|
||||
r = append(r, inspector.Pass("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry", dns.WildTLSDaysLeft), inspector.Critical))
|
||||
} else if dns.WildTLSDaysLeft > 7 {
|
||||
r = append(r, inspector.Warn("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry (expiring soon)", dns.WildTLSDaysLeft), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
|
||||
fmt.Sprintf("%d days until expiry (CRITICAL)", dns.WildTLSDaysLeft), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func checkDNSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
|
||||
activeCount := 0
|
||||
totalNS := 0
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.DNS == nil {
|
||||
continue
|
||||
}
|
||||
totalNS++
|
||||
if nd.DNS.CoreDNSActive {
|
||||
activeCount++
|
||||
}
|
||||
}
|
||||
|
||||
if totalNS == 0 {
|
||||
return r
|
||||
}
|
||||
|
||||
if activeCount == totalNS {
|
||||
r = append(r, inspector.Pass("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "",
|
||||
fmt.Sprintf("%d/%d nameservers active", activeCount, totalNS), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "",
|
||||
fmt.Sprintf("%d/%d nameservers active", activeCount, totalNS), inspector.Critical))
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
232
pkg/inspector/checks/dns_test.go
Normal file
232
pkg/inspector/checks/dns_test.go
Normal file
@ -0,0 +1,232 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckDNS_CoreDNSInactive(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: false}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
|
||||
expectStatus(t, results, "dns.coredns_active", inspector.StatusFail)
|
||||
// Early return — no port checks
|
||||
if findCheck(results, "dns.port_53") != nil {
|
||||
t.Error("should not check ports when CoreDNS inactive")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_HealthyNode(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{
|
||||
CoreDNSActive: true,
|
||||
CaddyActive: true,
|
||||
Port53Bound: true,
|
||||
Port80Bound: true,
|
||||
Port443Bound: true,
|
||||
CoreDNSMemMB: 50,
|
||||
CoreDNSRestarts: 0,
|
||||
LogErrors: 0,
|
||||
CorefileExists: true,
|
||||
SOAResolves: true,
|
||||
NSResolves: true,
|
||||
NSRecordCount: 3,
|
||||
WildcardResolves: true,
|
||||
BaseAResolves: true,
|
||||
BaseTLSDaysLeft: 60,
|
||||
WildTLSDaysLeft: 60,
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
|
||||
expectStatus(t, results, "dns.coredns_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.caddy_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.port_53", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.port_80", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.port_443", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.coredns_memory", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.coredns_restarts", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.coredns_log_errors", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.corefile_exists", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.soa_resolves", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.ns_resolves", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.base_a_resolves", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.tls_base", inspector.StatusPass)
|
||||
expectStatus(t, results, "dns.tls_wildcard", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckDNS_PortsFailing(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{
|
||||
CoreDNSActive: true,
|
||||
Port53Bound: false,
|
||||
Port80Bound: false,
|
||||
Port443Bound: false,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.port_53", inspector.StatusFail)
|
||||
expectStatus(t, results, "dns.port_80", inspector.StatusWarn)
|
||||
expectStatus(t, results, "dns.port_443", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckDNS_Memory(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
memMB int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 50, inspector.StatusPass},
|
||||
{"elevated", 150, inspector.StatusWarn},
|
||||
{"high", 250, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSMemMB: tt.memMB}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.coredns_memory", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_Restarts(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
restarts int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"zero", 0, inspector.StatusPass},
|
||||
{"few", 2, inspector.StatusWarn},
|
||||
{"many", 5, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSRestarts: tt.restarts}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.coredns_restarts", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_LogErrors(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
errors int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"none", 0, inspector.StatusPass},
|
||||
{"few", 3, inspector.StatusWarn},
|
||||
{"many", 10, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: true, LogErrors: tt.errors}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.coredns_log_errors", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_TLSExpiry(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
days int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 60, inspector.StatusPass},
|
||||
{"expiring soon", 20, inspector.StatusWarn},
|
||||
{"critical", 3, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{
|
||||
CoreDNSActive: true,
|
||||
BaseTLSDaysLeft: tt.days,
|
||||
WildTLSDaysLeft: tt.days,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.tls_base", tt.status)
|
||||
expectStatus(t, results, "dns.tls_wildcard", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_TLSNotChecked(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{
|
||||
CoreDNSActive: true,
|
||||
BaseTLSDaysLeft: -1,
|
||||
WildTLSDaysLeft: -1,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
// TLS checks should not be emitted when days == -1
|
||||
if findCheck(results, "dns.tls_base") != nil {
|
||||
t.Error("should not emit tls_base when days == -1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckDNS_ResolutionFailures(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{
|
||||
CoreDNSActive: true,
|
||||
SOAResolves: false,
|
||||
NSResolves: false,
|
||||
WildcardResolves: false,
|
||||
BaseAResolves: false,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.soa_resolves", inspector.StatusFail)
|
||||
expectStatus(t, results, "dns.ns_resolves", inspector.StatusFail)
|
||||
expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusFail)
|
||||
expectStatus(t, results, "dns.base_a_resolves", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckDNS_CrossNode_AllActive(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for _, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
|
||||
nd := makeNodeData(host, "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: true}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.all_ns_active", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckDNS_CrossNode_PartialActive(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
active := []bool{true, true, false}
|
||||
for i, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
|
||||
nd := makeNodeData(host, "nameserver-ns1")
|
||||
nd.DNS = &inspector.DNSData{CoreDNSActive: active[i]}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckDNS(data)
|
||||
expectStatus(t, results, "dns.all_ns_active", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckDNS_NilData(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckDNS(data)
|
||||
if len(results) != 0 {
|
||||
t.Errorf("expected 0 results for nil DNS data, got %d", len(results))
|
||||
}
|
||||
}
|
||||
74
pkg/inspector/checks/helpers_test.go
Normal file
74
pkg/inspector/checks/helpers_test.go
Normal file
@ -0,0 +1,74 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// makeNode creates a test Node with the given host and role.
// The remaining fields get fixed placeholder values ("devnet"/"ubuntu"/"test").
func makeNode(host, role string) inspector.Node {
	return inspector.Node{
		Environment: "devnet",
		User:        "ubuntu",
		Host:        host,
		Password:    "test",
		Role:        role,
	}
}

// makeNodeData creates a NodeData with a node but no subsystem data.
// Tests fill in only the subsystem fields they exercise; the rest stay nil.
func makeNodeData(host, role string) *inspector.NodeData {
	return &inspector.NodeData{
		Node: makeNode(host, role),
	}
}

// makeCluster creates a ClusterData from a map of host → NodeData.
// Duration is an arbitrary fixed value.
func makeCluster(nodes map[string]*inspector.NodeData) *inspector.ClusterData {
	return &inspector.ClusterData{
		Nodes:    nodes,
		Duration: 1 * time.Second,
	}
}
|
||||
|
||||
// countByStatus counts results with the given status.
|
||||
func countByStatus(results []inspector.CheckResult, status inspector.Status) int {
|
||||
n := 0
|
||||
for _, r := range results {
|
||||
if r.Status == status {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// findCheck returns a pointer to the first check matching the given ID, or nil.
|
||||
func findCheck(results []inspector.CheckResult, id string) *inspector.CheckResult {
|
||||
for i := range results {
|
||||
if results[i].ID == id {
|
||||
return &results[i]
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// requireCheck finds a check by ID and fails the test if not found.
|
||||
func requireCheck(t *testing.T, results []inspector.CheckResult, id string) inspector.CheckResult {
|
||||
t.Helper()
|
||||
c := findCheck(results, id)
|
||||
if c == nil {
|
||||
t.Fatalf("check %q not found in %d results", id, len(results))
|
||||
}
|
||||
return *c
|
||||
}
|
||||
|
||||
// expectStatus asserts that a check with the given ID has the expected status.
|
||||
func expectStatus(t *testing.T, results []inspector.CheckResult, id string, status inspector.Status) {
|
||||
t.Helper()
|
||||
c := requireCheck(t, results, id)
|
||||
if c.Status != status {
|
||||
t.Errorf("check %q: want status=%s, got status=%s (msg=%s)", id, status, c.Status, c.Message)
|
||||
}
|
||||
}
|
||||
232
pkg/inspector/checks/ipfs.go
Normal file
232
pkg/inspector/checks/ipfs.go
Normal file
@ -0,0 +1,232 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// Register the IPFS checker with the inspector framework at package load time.
func init() {
	inspector.RegisterChecker("ipfs", CheckIPFS)
}

// ipfsSub is the subsystem label attached to every IPFS check result.
const ipfsSub = "ipfs"
|
||||
|
||||
// CheckIPFS runs all IPFS health checks against cluster data.
|
||||
func CheckIPFS(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.IPFS == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkIPFSPerNode(nd, data)...)
|
||||
}
|
||||
|
||||
results = append(results, checkIPFSCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// checkIPFSPerNode evaluates a single node's IPFS health: daemon and cluster
// service liveness, swarm and cluster peer counts, cluster peer errors, repo
// usage, reported versions, and private-swarm configuration (swarm key and
// bootstrap list). If the daemon itself is inactive, the function returns
// early with only that failure — the remaining checks would be meaningless.
func checkIPFSPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	ipfs := nd.IPFS
	node := nd.Node.Name()

	// 3.1 IPFS daemon running
	if ipfs.DaemonActive {
		r = append(r, inspector.Pass("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is not active", inspector.Critical))
		// Early return: skip all remaining checks for a dead daemon.
		return r
	}

	// 3.2 IPFS Cluster running
	if ipfs.ClusterActive {
		r = append(r, inspector.Pass("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is not active", inspector.Critical))
	}

	// 3.6 Swarm peer count. Each node is expected to see every other
	// IPFS-bearing node as a peer (expectedNodes - 1). A negative count is
	// treated as "no data" and the check is skipped entirely.
	expectedNodes := countIPFSNodes(data)
	if ipfs.SwarmPeerCount >= 0 {
		expectedPeers := expectedNodes - 1
		if expectedPeers < 0 {
			expectedPeers = 0
		}
		if ipfs.SwarmPeerCount >= expectedPeers {
			r = append(r, inspector.Pass("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else if ipfs.SwarmPeerCount > 0 {
			// Partially connected: below expectation but not isolated.
			r = append(r, inspector.Warn("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else {
			// Zero peers while others exist: this node is isolated.
			r = append(r, inspector.Fail("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (isolated!)", ipfs.SwarmPeerCount), inspector.Critical))
		}
	}

	// 3.12 Cluster peer count. Unlike swarm peers, the cluster peer list
	// includes the node itself, so the expectation is the full node count.
	// Negative counts are skipped as "no data".
	if ipfs.ClusterPeerCount >= 0 {
		if ipfs.ClusterPeerCount >= expectedNodes {
			r = append(r, inspector.Pass("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		} else {
			r = append(r, inspector.Warn("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		}
	}

	// 3.14 Cluster peer errors
	if ipfs.ClusterErrors == 0 {
		r = append(r, inspector.Pass("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			"all cluster peers healthy", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			fmt.Sprintf("%d peers reporting errors", ipfs.ClusterErrors), inspector.Critical))
	}

	// 3.20 Repo size vs max: pass below 80% usage, warn up to 95%, fail
	// beyond that. Skipped unless both size and limit are positive.
	if ipfs.RepoMaxBytes > 0 && ipfs.RepoSizeBytes > 0 {
		pct := float64(ipfs.RepoSizeBytes) / float64(ipfs.RepoMaxBytes) * 100
		sizeMB := ipfs.RepoSizeBytes / (1024 * 1024)
		maxMB := ipfs.RepoMaxBytes / (1024 * 1024)
		if pct < 80 {
			r = append(r, inspector.Pass("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else if pct < 95 {
			r = append(r, inspector.Warn("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else {
			r = append(r, inspector.Fail("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%% NEARLY FULL)", sizeMB, maxMB, pct), inspector.Critical))
		}
	}

	// 3.3 Version reporting. These are informational Pass-only checks; an
	// empty or "unknown" version simply emits nothing.
	if ipfs.KuboVersion != "" && ipfs.KuboVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.kubo_version", "Kubo version reported", ipfsSub, node,
			fmt.Sprintf("kubo=%s", ipfs.KuboVersion), inspector.Low))
	}
	if ipfs.ClusterVersion != "" && ipfs.ClusterVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.cluster_version", "Cluster version reported", ipfsSub, node,
			fmt.Sprintf("cluster=%s", ipfs.ClusterVersion), inspector.Low))
	}

	// 3.29 Swarm key exists (private swarm)
	if ipfs.HasSwarmKey {
		r = append(r, inspector.Pass("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key NOT found", inspector.Critical))
	}

	// 3.30 Bootstrap empty (private swarm): a private swarm should have no
	// public bootstrap peers configured, so non-empty is only a warning.
	if ipfs.BootstrapEmpty {
		r = append(r, inspector.Pass("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"no public bootstrap peers", inspector.High))
	} else {
		r = append(r, inspector.Warn("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"bootstrap list is not empty (should be empty for private swarm)", inspector.High))
	}

	return r
}
|
||||
|
||||
// checkIPFSCrossNode evaluates cluster-wide IPFS consistency: Kubo and
// Cluster version agreement across nodes, and repo size convergence. The
// checks only run when at least two nodes have an active IPFS daemon.
func checkIPFSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult

	// Collect only nodes with a running daemon; cross-node comparisons are
	// meaningless for dead or missing daemons.
	type nodeInfo struct {
		name string
		ipfs *inspector.IPFSData
	}
	var nodes []nodeInfo
	for _, nd := range data.Nodes {
		if nd.IPFS != nil && nd.IPFS.DaemonActive {
			nodes = append(nodes, nodeInfo{name: nd.Node.Name(), ipfs: nd.IPFS})
		}
	}

	if len(nodes) < 2 {
		return r
	}

	// Version consistency: bucket node names by reported version string,
	// ignoring empty/"unknown" values.
	kuboVersions := map[string][]string{}
	clusterVersions := map[string][]string{}
	for _, n := range nodes {
		if n.ipfs.KuboVersion != "" && n.ipfs.KuboVersion != "unknown" {
			kuboVersions[n.ipfs.KuboVersion] = append(kuboVersions[n.ipfs.KuboVersion], n.name)
		}
		if n.ipfs.ClusterVersion != "" && n.ipfs.ClusterVersion != "unknown" {
			clusterVersions[n.ipfs.ClusterVersion] = append(clusterVersions[n.ipfs.ClusterVersion], n.name)
		}
	}

	// Exactly one distinct version means consistent; the single-iteration
	// range is just a way to extract the lone map key. Zero versions (all
	// unknown) emits nothing.
	if len(kuboVersions) == 1 {
		for v := range kuboVersions {
			r = append(r, inspector.Pass("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(kuboVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(kuboVersions)), inspector.Medium))
	}

	if len(clusterVersions) == 1 {
		for v := range clusterVersions {
			r = append(r, inspector.Pass("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(clusterVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(clusterVersions)), inspector.Medium))
	}

	// Repo size convergence: compare the largest repo to the smallest; a
	// ratio above 2x is flagged as divergence. Non-positive sizes are
	// excluded, and the ratio is skipped entirely when the minimum is 0.
	var sizes []int64
	for _, n := range nodes {
		if n.ipfs.RepoSizeBytes > 0 {
			sizes = append(sizes, n.ipfs.RepoSizeBytes)
		}
	}
	if len(sizes) >= 2 {
		minSize, maxSize := sizes[0], sizes[0]
		for _, s := range sizes[1:] {
			if s < minSize {
				minSize = s
			}
			if s > maxSize {
				maxSize = s
			}
		}
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 2.0 {
				r = append(r, inspector.Pass("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx", ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx (diverged)", ratio), inspector.Medium))
			}
		}
	}

	return r
}
|
||||
|
||||
func countIPFSNodes(data *inspector.ClusterData) int {
|
||||
count := 0
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.IPFS != nil {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
183
pkg/inspector/checks/ipfs_test.go
Normal file
183
pkg/inspector/checks/ipfs_test.go
Normal file
@ -0,0 +1,183 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckIPFS_DaemonInactive(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: false}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
|
||||
expectStatus(t, results, "ipfs.daemon_active", inspector.StatusFail)
|
||||
// Early return — no swarm peer checks
|
||||
if findCheck(results, "ipfs.swarm_peers") != nil {
|
||||
t.Error("should not check swarm_peers when daemon inactive")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckIPFS_HealthyNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{
|
||||
DaemonActive: true,
|
||||
ClusterActive: true,
|
||||
SwarmPeerCount: 0, // single node: expected peers = 0
|
||||
ClusterPeerCount: 1, // single node cluster
|
||||
ClusterErrors: 0,
|
||||
RepoSizeBytes: 500 * 1024 * 1024, // 500MB
|
||||
RepoMaxBytes: 1024 * 1024 * 1024, // 1GB
|
||||
KuboVersion: "0.22.0",
|
||||
ClusterVersion: "1.0.8",
|
||||
HasSwarmKey: true,
|
||||
BootstrapEmpty: true,
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
|
||||
expectStatus(t, results, "ipfs.daemon_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.cluster_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.cluster_peers", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.cluster_errors", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.repo_size", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.swarm_key", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckIPFS_SwarmPeers(t *testing.T) {
|
||||
// Single-node cluster: expected peers = 0
|
||||
t.Run("enough", func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
// swarm_peers=2, expected=0 → pass
|
||||
expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass)
|
||||
})
|
||||
|
||||
t.Run("low but nonzero", func(t *testing.T) {
|
||||
// 3-node cluster: expected peers = 2 per node
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1} // has 1, expects 2
|
||||
nd2 := makeNodeData("2.2.2.2", "node")
|
||||
nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
|
||||
nd3 := makeNodeData("3.3.3.3", "node")
|
||||
nd3.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
|
||||
data := makeCluster(map[string]*inspector.NodeData{
|
||||
"1.1.1.1": nd, "2.2.2.2": nd2, "3.3.3.3": nd3,
|
||||
})
|
||||
results := CheckIPFS(data)
|
||||
// Node 1.1.1.1 should warn (1 < 2)
|
||||
found := false
|
||||
for _, r := range results {
|
||||
if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusWarn {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Error("expected swarm_peers warn for node 1.1.1.1")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("zero isolated", func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 0}
|
||||
nd2 := makeNodeData("2.2.2.2", "node")
|
||||
nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1}
|
||||
data := makeCluster(map[string]*inspector.NodeData{
|
||||
"1.1.1.1": nd, "2.2.2.2": nd2,
|
||||
})
|
||||
results := CheckIPFS(data)
|
||||
found := false
|
||||
for _, r := range results {
|
||||
if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusFail {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Error("expected swarm_peers fail for isolated node 1.1.1.1")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestCheckIPFS_RepoSize(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
size int64
|
||||
max int64
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 500 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusPass}, // 50%
|
||||
{"elevated", 870 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusWarn}, // 85%
|
||||
{"nearly full", 980 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusFail}, // 96%
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{
|
||||
DaemonActive: true,
|
||||
RepoSizeBytes: tt.size,
|
||||
RepoMaxBytes: tt.max,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
expectStatus(t, results, "ipfs.repo_size", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckIPFS_SwarmKeyMissing(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, HasSwarmKey: false}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
expectStatus(t, results, "ipfs.swarm_key", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckIPFS_BootstrapNotEmpty(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, BootstrapEmpty: false}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckIPFS_CrossNode_VersionConsistency(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: "0.22.0", ClusterVersion: "1.0.8"}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckIPFS(data)
|
||||
expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusPass)
|
||||
expectStatus(t, results, "ipfs.cluster_version_consistent", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckIPFS_CrossNode_VersionMismatch(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
versions := []string{"0.22.0", "0.22.0", "0.21.0"}
|
||||
for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: versions[i]}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckIPFS(data)
|
||||
expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckIPFS_NilData(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckIPFS(data)
|
||||
if len(results) != 0 {
|
||||
t.Errorf("expected 0 results for nil IPFS data, got %d", len(results))
|
||||
}
|
||||
}
|
||||
155
pkg/inspector/checks/namespace.go
Normal file
155
pkg/inspector/checks/namespace.go
Normal file
@ -0,0 +1,155 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// init registers the namespace checker with the inspector's global check
// registry so it runs as part of every cluster inspection.
func init() {
	inspector.RegisterChecker("namespace", CheckNamespace)
}

// nsSub is the subsystem label attached to every namespace check result.
const nsSub = "namespace"
|
||||
|
||||
// CheckNamespace runs all namespace-level health checks.
|
||||
func CheckNamespace(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if len(nd.Namespaces) == 0 {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkNamespacesPerNode(nd)...)
|
||||
}
|
||||
|
||||
results = append(results, checkNamespacesCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// checkNamespacesPerNode emits RQLite, Olric, and Gateway health results for
// every namespace hosted on a single node. Check IDs are namespaced as
// "ns.<name>.<check>".
func checkNamespacesPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	node := nd.Node.Name()

	for _, ns := range nd.Namespaces {
		prefix := fmt.Sprintf("ns.%s", ns.Name)

		// RQLite health: did the namespace's RQLite instance respond?
		if ns.RQLiteUp {
			r = append(r, inspector.Pass(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d state=%s", ns.PortBase, ns.RQLiteState), inspector.Critical))
		} else {
			r = append(r, inspector.Fail(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d not responding", ns.PortBase), inspector.Critical))
		}

		// RQLite Raft state: only evaluated when the instance responded.
		// Leader/Follower are healthy; Candidate means an election is in
		// progress (warn); anything else fails.
		if ns.RQLiteUp {
			switch ns.RQLiteState {
			case "Leader", "Follower":
				r = append(r, inspector.Pass(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			case "Candidate":
				r = append(r, inspector.Warn(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					"state=Candidate (election in progress)", inspector.Critical))
			default:
				r = append(r, inspector.Fail(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			}
		}

		// RQLite readiness: a readiness failure is only reported when the
		// process responded at all — a down instance already failed
		// rqlite_up above, so no result is emitted in that case.
		if ns.RQLiteReady {
			r = append(r, inspector.Pass(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz OK", inspector.Critical))
		} else if ns.RQLiteUp {
			r = append(r, inspector.Fail(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz failed", inspector.Critical))
		}

		// Olric health: is the memberlist port bound?
		if ns.OlricUp {
			r = append(r, inspector.Pass(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port bound", inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port not bound", inspector.High))
		}

		// Gateway health: did the namespace gateway answer over HTTP?
		if ns.GatewayUp {
			r = append(r, inspector.Pass(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		}
	}

	return r
}
|
||||
|
||||
func checkNamespacesCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
|
||||
// Collect all namespace names across nodes
|
||||
nsNodes := map[string]int{} // namespace name → count of nodes running it
|
||||
nsHealthy := map[string]int{} // namespace name → count of nodes where all services are up
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
for _, ns := range nd.Namespaces {
|
||||
nsNodes[ns.Name]++
|
||||
if ns.RQLiteUp && ns.OlricUp && ns.GatewayUp {
|
||||
nsHealthy[ns.Name]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for name, total := range nsNodes {
|
||||
healthy := nsHealthy[name]
|
||||
if healthy == total {
|
||||
r = append(r, inspector.Pass(
|
||||
fmt.Sprintf("ns.%s.all_healthy", name),
|
||||
fmt.Sprintf("Namespace %s healthy on all nodes", name),
|
||||
nsSub, "",
|
||||
fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
|
||||
inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail(
|
||||
fmt.Sprintf("ns.%s.all_healthy", name),
|
||||
fmt.Sprintf("Namespace %s healthy on all nodes", name),
|
||||
nsSub, "",
|
||||
fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
|
||||
inspector.Critical))
|
||||
}
|
||||
|
||||
// Check namespace has quorum (>= N/2+1 RQLite instances)
|
||||
rqliteUp := 0
|
||||
for _, nd := range data.Nodes {
|
||||
for _, ns := range nd.Namespaces {
|
||||
if ns.Name == name && ns.RQLiteUp {
|
||||
rqliteUp++
|
||||
}
|
||||
}
|
||||
}
|
||||
quorumNeeded := total/2 + 1
|
||||
if rqliteUp >= quorumNeeded {
|
||||
r = append(r, inspector.Pass(
|
||||
fmt.Sprintf("ns.%s.quorum", name),
|
||||
fmt.Sprintf("Namespace %s RQLite quorum", name),
|
||||
nsSub, "",
|
||||
fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d", rqliteUp, total, quorumNeeded),
|
||||
inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail(
|
||||
fmt.Sprintf("ns.%s.quorum", name),
|
||||
fmt.Sprintf("Namespace %s RQLite quorum", name),
|
||||
nsSub, "",
|
||||
fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d (QUORUM LOST)", rqliteUp, total, quorumNeeded),
|
||||
inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
165
pkg/inspector/checks/namespace_test.go
Normal file
165
pkg/inspector/checks/namespace_test.go
Normal file
@ -0,0 +1,165 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckNamespace_PerNodeHealthy(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{
|
||||
Name: "myapp",
|
||||
PortBase: 10000,
|
||||
RQLiteUp: true,
|
||||
RQLiteState: "Leader",
|
||||
RQLiteReady: true,
|
||||
OlricUp: true,
|
||||
GatewayUp: true,
|
||||
GatewayStatus: 200,
|
||||
},
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
|
||||
expectStatus(t, results, "ns.myapp.rqlite_up", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.myapp.rqlite_state", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_RQLiteDown(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", PortBase: 10000, RQLiteUp: false},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.rqlite_up", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_RQLiteStates(t *testing.T) {
|
||||
tests := []struct {
|
||||
state string
|
||||
status inspector.Status
|
||||
}{
|
||||
{"Leader", inspector.StatusPass},
|
||||
{"Follower", inspector.StatusPass},
|
||||
{"Candidate", inspector.StatusWarn},
|
||||
{"Unknown", inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.state, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: tt.state},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.rqlite_state", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNamespace_RQLiteNotReady(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: "Follower", RQLiteReady: false},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_OlricDown(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", OlricUp: false},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_GatewayDown(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", GatewayUp: false, GatewayStatus: 0},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_CrossNode_AllHealthy(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", RQLiteUp: true, OlricUp: true, GatewayUp: true},
|
||||
}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_CrossNode_PartialHealthy(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", RQLiteUp: true, OlricUp: i < 2, GatewayUp: true},
|
||||
}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusFail)
|
||||
// Quorum should still pass (3/3 RQLite up, need 2)
|
||||
expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_CrossNode_QuorumLost(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
rqliteUp := []bool{true, false, false}
|
||||
for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "myapp", RQLiteUp: rqliteUp[i], OlricUp: true, GatewayUp: true},
|
||||
}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckNamespace(data)
|
||||
expectStatus(t, results, "ns.myapp.quorum", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_MultipleNamespaces(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = []inspector.NamespaceData{
|
||||
{Name: "app1", RQLiteUp: true, RQLiteState: "Leader", OlricUp: true, GatewayUp: true},
|
||||
{Name: "app2", RQLiteUp: false, OlricUp: true, GatewayUp: true},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
|
||||
expectStatus(t, results, "ns.app1.rqlite_up", inspector.StatusPass)
|
||||
expectStatus(t, results, "ns.app2.rqlite_up", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNamespace_NoNamespaces(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Namespaces = nil
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNamespace(data)
|
||||
// No per-node results, only cross-node (which should be empty since no namespaces)
|
||||
for _, r := range results {
|
||||
t.Errorf("unexpected check: %s", r.ID)
|
||||
}
|
||||
}
|
||||
113
pkg/inspector/checks/network.go
Normal file
113
pkg/inspector/checks/network.go
Normal file
@ -0,0 +1,113 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// init registers the network checker with the inspector's global check
// registry so it runs as part of every cluster inspection.
func init() {
	inspector.RegisterChecker("network", CheckNetwork)
}

// networkSub is the subsystem label attached to every network check result.
const networkSub = "network"
|
||||
|
||||
// CheckNetwork runs all network-level health checks.
|
||||
func CheckNetwork(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.Network == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkNetworkPerNode(nd)...)
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// checkNetworkPerNode evaluates a single node's network health: internet
// reachability, routing (default route and WireGuard subnet route), TCP
// connection pressure (established, TIME_WAIT, retransmission rate), and
// WireGuard mesh ping connectivity to peer nodes.
func checkNetworkPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	net := nd.Network
	node := nd.Node.Name()

	// 7.2 Internet connectivity
	if net.InternetReachable {
		r = append(r, inspector.Pass("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 succeeded", inspector.High))
	} else {
		r = append(r, inspector.Fail("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 failed", inspector.High))
	}

	// 7.14 Default route
	if net.DefaultRoute {
		r = append(r, inspector.Pass("network.default_route", "Default route exists", networkSub, node,
			"default route present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.default_route", "Default route exists", networkSub, node,
			"no default route", inspector.Critical))
	}

	// 7.15 WG subnet route (10.0.0.0/24 via wg0)
	if net.WGRouteExists {
		r = append(r, inspector.Pass("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 via wg0 present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 route via wg0 NOT found", inspector.Critical))
	}

	// 7.4 TCP connections: warn above 5000 established; a count of zero
	// emits no result at all.
	if net.TCPEstablished > 0 {
		if net.TCPEstablished < 5000 {
			r = append(r, inspector.Pass("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d", net.TCPEstablished), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d (high)", net.TCPEstablished), inspector.Medium))
		}
	}

	// 7.6 TIME_WAIT: warn at 10000 or more sockets stuck in TIME_WAIT.
	if net.TCPTimeWait < 10000 {
		r = append(r, inspector.Pass("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d", net.TCPTimeWait), inspector.Medium))
	} else {
		r = append(r, inspector.Warn("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d (accumulating)", net.TCPTimeWait), inspector.Medium))
	}

	// 7.8 TCP retransmission rate (percent): <1% pass, <5% warn, otherwise
	// fail. Negative rates are treated as "no data" and skipped.
	if net.TCPRetransRate >= 0 {
		if net.TCPRetransRate < 1 {
			r = append(r, inspector.Pass("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%%", net.TCPRetransRate), inspector.Medium))
		} else if net.TCPRetransRate < 5 {
			r = append(r, inspector.Warn("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (elevated)", net.TCPRetransRate), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (high packet loss)", net.TCPRetransRate), inspector.High))
		}
	}

	// 7.10 WG mesh peer pings (NxN connectivity): all peers must be
	// pingable; any unreachable peer fails the whole check. Skipped when
	// there are no ping results.
	if len(net.PingResults) > 0 {
		failCount := 0
		for _, ok := range net.PingResults {
			if !ok {
				failCount++
			}
		}
		if failCount == 0 {
			r = append(r, inspector.Pass("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers pingable", len(net.PingResults), len(net.PingResults)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers unreachable", failCount, len(net.PingResults)), inspector.Critical))
		}
	}

	return r
}
|
||||
151
pkg/inspector/checks/network_test.go
Normal file
151
pkg/inspector/checks/network_test.go
Normal file
@ -0,0 +1,151 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckNetwork_HealthyNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{
|
||||
InternetReachable: true,
|
||||
DefaultRoute: true,
|
||||
WGRouteExists: true,
|
||||
TCPEstablished: 100,
|
||||
TCPTimeWait: 50,
|
||||
TCPRetransRate: 0.1,
|
||||
PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
|
||||
expectStatus(t, results, "network.internet", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.default_route", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.wg_route", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.tcp_established", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.tcp_timewait", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.tcp_retrans", inspector.StatusPass)
|
||||
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckNetwork_InternetUnreachable(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{InternetReachable: false}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.internet", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNetwork_MissingRoutes(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{DefaultRoute: false, WGRouteExists: false}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.default_route", inspector.StatusFail)
|
||||
expectStatus(t, results, "network.wg_route", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckNetwork_TCPConnections(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
estab int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"normal", 100, inspector.StatusPass},
|
||||
{"high", 6000, inspector.StatusWarn},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{TCPEstablished: tt.estab}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.tcp_established", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNetwork_TCPTimeWait(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
tw int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"normal", 50, inspector.StatusPass},
|
||||
{"high", 15000, inspector.StatusWarn},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{TCPTimeWait: tt.tw}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.tcp_timewait", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNetwork_TCPRetransmission(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
rate float64
|
||||
status inspector.Status
|
||||
}{
|
||||
{"low", 0.1, inspector.StatusPass},
|
||||
{"elevated", 3.0, inspector.StatusWarn},
|
||||
{"high", 8.0, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{TCPRetransRate: tt.rate}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.tcp_retrans", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNetwork_WGMeshPing(t *testing.T) {
|
||||
t.Run("all ok", func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{
|
||||
PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass)
|
||||
})
|
||||
|
||||
t.Run("some fail", func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{
|
||||
PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": false},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusFail)
|
||||
})
|
||||
|
||||
t.Run("no pings", func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Network = &inspector.NetworkData{PingResults: map[string]bool{}}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
// No ping results → no wg_mesh_ping check
|
||||
if findCheck(results, "network.wg_mesh_ping") != nil {
|
||||
t.Error("should not emit wg_mesh_ping when no ping results")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestCheckNetwork_NilData(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckNetwork(data)
|
||||
if len(results) != 0 {
|
||||
t.Errorf("expected 0 results for nil Network data, got %d", len(results))
|
||||
}
|
||||
}
|
||||
157
pkg/inspector/checks/olric.go
Normal file
157
pkg/inspector/checks/olric.go
Normal file
@ -0,0 +1,157 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// Register the Olric checker with the inspector's checker registry so it is
// included when a full cluster inspection runs.
func init() {
	inspector.RegisterChecker("olric", CheckOlric)
}
|
||||
|
||||
// olricSub is the subsystem label attached to every Olric check result.
const olricSub = "olric"
|
||||
|
||||
// CheckOlric runs all Olric health checks against cluster data.
|
||||
func CheckOlric(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.Olric == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkOlricPerNode(nd)...)
|
||||
}
|
||||
|
||||
results = append(results, checkOlricCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func checkOlricPerNode(nd *inspector.NodeData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
ol := nd.Olric
|
||||
node := nd.Node.Name()
|
||||
|
||||
// 2.1 Service active
|
||||
if ol.ServiceActive {
|
||||
r = append(r, inspector.Pass("olric.service_active", "Olric service active", olricSub, node,
|
||||
"debros-olric is active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.service_active", "Olric service active", olricSub, node,
|
||||
"debros-olric is not active", inspector.Critical))
|
||||
return r
|
||||
}
|
||||
|
||||
// 2.7 Memberlist port accepting connections
|
||||
if ol.MemberlistUp {
|
||||
r = append(r, inspector.Pass("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
|
||||
"TCP 3322 is bound", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
|
||||
"TCP 3322 is not listening", inspector.Critical))
|
||||
}
|
||||
|
||||
// 2.3 Restart count
|
||||
if ol.RestartCount == 0 {
|
||||
r = append(r, inspector.Pass("olric.restarts", "Low restart count", olricSub, node,
|
||||
"NRestarts=0", inspector.High))
|
||||
} else if ol.RestartCount <= 3 {
|
||||
r = append(r, inspector.Warn("olric.restarts", "Low restart count", olricSub, node,
|
||||
fmt.Sprintf("NRestarts=%d", ol.RestartCount), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.restarts", "Low restart count", olricSub, node,
|
||||
fmt.Sprintf("NRestarts=%d (crash-looping?)", ol.RestartCount), inspector.High))
|
||||
}
|
||||
|
||||
// 2.4 Process memory
|
||||
if ol.ProcessMemMB > 0 {
|
||||
if ol.ProcessMemMB < 200 {
|
||||
r = append(r, inspector.Pass("olric.memory", "Process memory healthy", olricSub, node,
|
||||
fmt.Sprintf("RSS=%dMB", ol.ProcessMemMB), inspector.Medium))
|
||||
} else if ol.ProcessMemMB < 500 {
|
||||
r = append(r, inspector.Warn("olric.memory", "Process memory healthy", olricSub, node,
|
||||
fmt.Sprintf("RSS=%dMB (elevated)", ol.ProcessMemMB), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.memory", "Process memory healthy", olricSub, node,
|
||||
fmt.Sprintf("RSS=%dMB (high)", ol.ProcessMemMB), inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
// 2.9-2.11 Log analysis: suspects
|
||||
if ol.LogSuspects == 0 {
|
||||
r = append(r, inspector.Pass("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
|
||||
"no suspect messages in last hour", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
|
||||
fmt.Sprintf("%d suspect/failed messages in last hour", ol.LogSuspects), inspector.Critical))
|
||||
}
|
||||
|
||||
// 2.13 Flapping detection
|
||||
if ol.LogFlapping < 5 {
|
||||
r = append(r, inspector.Pass("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
|
||||
fmt.Sprintf("join/leave events=%d in last hour", ol.LogFlapping), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
|
||||
fmt.Sprintf("join/leave events=%d in last hour (flapping?)", ol.LogFlapping), inspector.High))
|
||||
}
|
||||
|
||||
// 2.39 Log error rate
|
||||
if ol.LogErrors < 5 {
|
||||
r = append(r, inspector.Pass("olric.log_errors", "Log error rate low", olricSub, node,
|
||||
fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
|
||||
} else if ol.LogErrors < 20 {
|
||||
r = append(r, inspector.Warn("olric.log_errors", "Log error rate low", olricSub, node,
|
||||
fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.log_errors", "Log error rate low", olricSub, node,
|
||||
fmt.Sprintf("errors=%d in last hour (high)", ol.LogErrors), inspector.High))
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func checkOlricCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
|
||||
activeCount := 0
|
||||
memberlistCount := 0
|
||||
totalNodes := 0
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.Olric == nil {
|
||||
continue
|
||||
}
|
||||
totalNodes++
|
||||
if nd.Olric.ServiceActive {
|
||||
activeCount++
|
||||
}
|
||||
if nd.Olric.MemberlistUp {
|
||||
memberlistCount++
|
||||
}
|
||||
}
|
||||
|
||||
if totalNodes < 2 {
|
||||
return r
|
||||
}
|
||||
|
||||
// All nodes have Olric running
|
||||
if activeCount == totalNodes {
|
||||
r = append(r, inspector.Pass("olric.all_active", "All nodes running Olric", olricSub, "",
|
||||
fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("olric.all_active", "All nodes running Olric", olricSub, "",
|
||||
fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical))
|
||||
}
|
||||
|
||||
// All memberlist ports up
|
||||
if memberlistCount == totalNodes {
|
||||
r = append(r, inspector.Pass("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
|
||||
fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
|
||||
fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High))
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
149
pkg/inspector/checks/olric_test.go
Normal file
149
pkg/inspector/checks/olric_test.go
Normal file
@ -0,0 +1,149 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckOlric_ServiceInactive(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: false}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
|
||||
expectStatus(t, results, "olric.service_active", inspector.StatusFail)
|
||||
// Should return early — no further per-node checks
|
||||
if findCheck(results, "olric.memberlist_port") != nil {
|
||||
t.Error("should not check memberlist when service inactive")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckOlric_HealthyNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{
|
||||
ServiceActive: true,
|
||||
MemberlistUp: true,
|
||||
RestartCount: 0,
|
||||
ProcessMemMB: 100,
|
||||
LogSuspects: 0,
|
||||
LogFlapping: 0,
|
||||
LogErrors: 0,
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
|
||||
expectStatus(t, results, "olric.service_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.memberlist_port", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.restarts", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.log_suspects", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.log_flapping", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.log_errors", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckOlric_RestartCounts(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
restarts int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"zero", 0, inspector.StatusPass},
|
||||
{"few", 2, inspector.StatusWarn},
|
||||
{"many", 5, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: true, RestartCount: tt.restarts}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.restarts", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckOlric_Memory(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
memMB int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 100, inspector.StatusPass},
|
||||
{"elevated", 300, inspector.StatusWarn},
|
||||
{"high", 600, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: true, ProcessMemMB: tt.memMB}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.memory", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckOlric_LogSuspects(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: true, LogSuspects: 5}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.log_suspects", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckOlric_LogErrors(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
errors int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"none", 0, inspector.StatusPass},
|
||||
{"few", 10, inspector.StatusWarn},
|
||||
{"many", 30, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: true, LogErrors: tt.errors}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.log_errors", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckOlric_CrossNode_AllActive(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: true, MemberlistUp: true}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.all_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "olric.all_memberlist", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckOlric_CrossNode_PartialActive(t *testing.T) {
|
||||
nodes := map[string]*inspector.NodeData{}
|
||||
for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
|
||||
nd := makeNodeData(host, "node")
|
||||
nd.Olric = &inspector.OlricData{ServiceActive: i < 2, MemberlistUp: i < 2}
|
||||
nodes[host] = nd
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckOlric(data)
|
||||
expectStatus(t, results, "olric.all_active", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckOlric_NilData(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckOlric(data)
|
||||
if len(results) != 0 {
|
||||
t.Errorf("expected 0 results for nil Olric data, got %d", len(results))
|
||||
}
|
||||
}
|
||||
533
pkg/inspector/checks/rqlite.go
Normal file
533
pkg/inspector/checks/rqlite.go
Normal file
@ -0,0 +1,533 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// Register the RQLite checker with the inspector's checker registry so it is
// included when a full cluster inspection runs.
func init() {
	inspector.RegisterChecker("rqlite", CheckRQLite)
}
|
||||
|
||||
// rqliteSub is the subsystem label attached to every RQLite check result.
const rqliteSub = "rqlite"
|
||||
|
||||
// CheckRQLite runs all RQLite health checks against cluster data.
|
||||
func CheckRQLite(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
// Per-node checks
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.RQLite == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkRQLitePerNode(nd, data)...)
|
||||
}
|
||||
|
||||
// Cross-node checks
|
||||
results = append(results, checkRQLiteCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func checkRQLitePerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
rq := nd.RQLite
|
||||
node := nd.Node.Name()
|
||||
|
||||
// 1.2 HTTP endpoint responsive
|
||||
if !rq.Responsive {
|
||||
r = append(r, inspector.Fail("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
|
||||
"curl localhost:5001/status failed or returned error", inspector.Critical))
|
||||
return r
|
||||
}
|
||||
r = append(r, inspector.Pass("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
|
||||
"responding on port 5001", inspector.Critical))
|
||||
|
||||
// 1.3 Full readiness (/readyz)
|
||||
if rq.Readyz != nil {
|
||||
if rq.Readyz.Ready {
|
||||
r = append(r, inspector.Pass("rqlite.readyz", "Full readiness check", rqliteSub, node,
|
||||
"node, leader, store all ready", inspector.Critical))
|
||||
} else {
|
||||
var parts []string
|
||||
if rq.Readyz.Node != "ready" {
|
||||
parts = append(parts, "node: "+rq.Readyz.Node)
|
||||
}
|
||||
if rq.Readyz.Leader != "ready" {
|
||||
parts = append(parts, "leader: "+rq.Readyz.Leader)
|
||||
}
|
||||
if rq.Readyz.Store != "ready" {
|
||||
parts = append(parts, "store: "+rq.Readyz.Store)
|
||||
}
|
||||
r = append(r, inspector.Fail("rqlite.readyz", "Full readiness check", rqliteSub, node,
|
||||
"not ready: "+strings.Join(parts, ", "), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
s := rq.Status
|
||||
if s == nil {
|
||||
r = append(r, inspector.Skip("rqlite.status_parsed", "Status JSON parseable", rqliteSub, node,
|
||||
"could not parse /status response", inspector.Critical))
|
||||
return r
|
||||
}
|
||||
|
||||
// 1.5 Raft state valid
|
||||
switch s.RaftState {
|
||||
case "Leader", "Follower":
|
||||
r = append(r, inspector.Pass("rqlite.raft_state", "Raft state valid", rqliteSub, node,
|
||||
fmt.Sprintf("state=%s", s.RaftState), inspector.Critical))
|
||||
case "Candidate":
|
||||
r = append(r, inspector.Warn("rqlite.raft_state", "Raft state valid", rqliteSub, node,
|
||||
"state=Candidate (election in progress)", inspector.Critical))
|
||||
case "Shutdown":
|
||||
r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
|
||||
"state=Shutdown", inspector.Critical))
|
||||
default:
|
||||
r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
|
||||
fmt.Sprintf("unexpected state=%q", s.RaftState), inspector.Critical))
|
||||
}
|
||||
|
||||
// 1.7 Leader identity known
|
||||
if s.LeaderNodeID == "" {
|
||||
r = append(r, inspector.Fail("rqlite.leader_known", "Leader identity known", rqliteSub, node,
|
||||
"leader node_id is empty", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Pass("rqlite.leader_known", "Leader identity known", rqliteSub, node,
|
||||
fmt.Sprintf("leader=%s", s.LeaderNodeID), inspector.Critical))
|
||||
}
|
||||
|
||||
// 1.8 Voter status
|
||||
if s.Voter {
|
||||
r = append(r, inspector.Pass("rqlite.voter", "Node is voter", rqliteSub, node,
|
||||
"voter=true", inspector.Low))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.voter", "Node is voter", rqliteSub, node,
|
||||
"voter=false (non-voter)", inspector.Low))
|
||||
}
|
||||
|
||||
// 1.9 Num peers — use the node's own /nodes endpoint to determine cluster size
|
||||
// (not config file, since not all config nodes are necessarily in the Raft cluster)
|
||||
if rq.Nodes != nil && len(rq.Nodes) > 0 {
|
||||
expectedPeers := len(rq.Nodes) - 1 // cluster members minus self
|
||||
if expectedPeers < 0 {
|
||||
expectedPeers = 0
|
||||
}
|
||||
if s.NumPeers == expectedPeers {
|
||||
r = append(r, inspector.Pass("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
|
||||
fmt.Sprintf("peers=%d (cluster has %d nodes)", s.NumPeers, len(rq.Nodes)), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
|
||||
fmt.Sprintf("peers=%d but /nodes reports %d members", s.NumPeers, len(rq.Nodes)), inspector.High))
|
||||
}
|
||||
} else {
|
||||
r = append(r, inspector.Pass("rqlite.num_peers", "Peer count reported", rqliteSub, node,
|
||||
fmt.Sprintf("peers=%d", s.NumPeers), inspector.Medium))
|
||||
}
|
||||
|
||||
// 1.11 Commit index vs applied index
|
||||
if s.CommitIndex > 0 && s.AppliedIndex > 0 {
|
||||
gap := s.CommitIndex - s.AppliedIndex
|
||||
if s.AppliedIndex > s.CommitIndex {
|
||||
gap = 0
|
||||
}
|
||||
if gap <= 2 {
|
||||
r = append(r, inspector.Pass("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
|
||||
fmt.Sprintf("commit=%d applied=%d gap=%d", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
|
||||
} else if gap <= 100 {
|
||||
r = append(r, inspector.Warn("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
|
||||
fmt.Sprintf("commit=%d applied=%d gap=%d (lagging)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
|
||||
fmt.Sprintf("commit=%d applied=%d gap=%d (severely behind)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.12 FSM pending
|
||||
if s.FsmPending == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
|
||||
"fsm_pending=0", inspector.High))
|
||||
} else if s.FsmPending <= 10 {
|
||||
r = append(r, inspector.Warn("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
|
||||
fmt.Sprintf("fsm_pending=%d", s.FsmPending), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
|
||||
fmt.Sprintf("fsm_pending=%d (backlog)", s.FsmPending), inspector.High))
|
||||
}
|
||||
|
||||
// 1.13 Last contact (followers only)
|
||||
if s.RaftState == "Follower" && s.LastContact != "" {
|
||||
r = append(r, inspector.Pass("rqlite.last_contact", "Follower last contact recent", rqliteSub, node,
|
||||
fmt.Sprintf("last_contact=%s", s.LastContact), inspector.Critical))
|
||||
}
|
||||
|
||||
// 1.14 Last log term matches current term
|
||||
if s.LastLogTerm > 0 && s.Term > 0 {
|
||||
if s.LastLogTerm == s.Term {
|
||||
r = append(r, inspector.Pass("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
|
||||
fmt.Sprintf("term=%d last_log_term=%d", s.Term, s.LastLogTerm), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
|
||||
fmt.Sprintf("term=%d last_log_term=%d (mismatch)", s.Term, s.LastLogTerm), inspector.Medium))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.15 db_applied_index == fsm_index
|
||||
if s.DBAppliedIndex > 0 && s.FsmIndex > 0 {
|
||||
if s.DBAppliedIndex == s.FsmIndex {
|
||||
r = append(r, inspector.Pass("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
|
||||
fmt.Sprintf("db_applied=%d fsm=%d", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
|
||||
fmt.Sprintf("db_applied=%d fsm=%d (diverged)", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.18 Last snapshot index close to applied
|
||||
if s.LastSnapshot > 0 && s.AppliedIndex > 0 {
|
||||
gap := s.AppliedIndex - s.LastSnapshot
|
||||
if s.LastSnapshot > s.AppliedIndex {
|
||||
gap = 0
|
||||
}
|
||||
if gap < 10000 {
|
||||
r = append(r, inspector.Pass("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
|
||||
fmt.Sprintf("snapshot_index=%d applied=%d gap=%d", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
|
||||
fmt.Sprintf("snapshot_index=%d applied=%d gap=%d (old snapshot)", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.19 At least 1 snapshot exists
|
||||
if s.LastSnapshot > 0 {
|
||||
r = append(r, inspector.Pass("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
|
||||
fmt.Sprintf("last_snapshot_index=%d", s.LastSnapshot), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
|
||||
"no snapshots found", inspector.Medium))
|
||||
}
|
||||
|
||||
// 1.27 Database size
|
||||
if s.DBSizeFriendly != "" {
|
||||
r = append(r, inspector.Pass("rqlite.db_size", "Database size reported", rqliteSub, node,
|
||||
fmt.Sprintf("db_size=%s", s.DBSizeFriendly), inspector.Low))
|
||||
}
|
||||
|
||||
// 1.31 Goroutine count
|
||||
if s.Goroutines > 0 {
|
||||
if s.Goroutines < 200 {
|
||||
r = append(r, inspector.Pass("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
|
||||
fmt.Sprintf("goroutines=%d", s.Goroutines), inspector.Medium))
|
||||
} else if s.Goroutines < 1000 {
|
||||
r = append(r, inspector.Warn("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
|
||||
fmt.Sprintf("goroutines=%d (elevated)", s.Goroutines), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
|
||||
fmt.Sprintf("goroutines=%d (high)", s.Goroutines), inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.32 Memory (HeapAlloc)
|
||||
if s.HeapAlloc > 0 {
|
||||
mb := s.HeapAlloc / (1024 * 1024)
|
||||
if mb < 500 {
|
||||
r = append(r, inspector.Pass("rqlite.memory", "Memory usage healthy", rqliteSub, node,
|
||||
fmt.Sprintf("heap=%dMB", mb), inspector.Medium))
|
||||
} else if mb < 1000 {
|
||||
r = append(r, inspector.Warn("rqlite.memory", "Memory usage healthy", rqliteSub, node,
|
||||
fmt.Sprintf("heap=%dMB (elevated)", mb), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.memory", "Memory usage healthy", rqliteSub, node,
|
||||
fmt.Sprintf("heap=%dMB (high)", mb), inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.35 Version reported
|
||||
if s.Version != "" {
|
||||
r = append(r, inspector.Pass("rqlite.version", "Version reported", rqliteSub, node,
|
||||
fmt.Sprintf("version=%s", s.Version), inspector.Low))
|
||||
}
|
||||
|
||||
// Node reachability from /nodes endpoint
|
||||
if rq.Nodes != nil {
|
||||
unreachable := 0
|
||||
for addr, n := range rq.Nodes {
|
||||
if !n.Reachable {
|
||||
unreachable++
|
||||
r = append(r, inspector.Fail("rqlite.node_reachable", "Cluster node reachable", rqliteSub, node,
|
||||
fmt.Sprintf("%s is unreachable from this node", addr), inspector.Critical))
|
||||
}
|
||||
}
|
||||
if unreachable == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.all_reachable", "All cluster nodes reachable", rqliteSub, node,
|
||||
fmt.Sprintf("all %d nodes reachable", len(rq.Nodes)), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 1.46 Strong read test
|
||||
if rq.StrongRead {
|
||||
r = append(r, inspector.Pass("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
|
||||
"SELECT 1 at level=strong OK", inspector.Critical))
|
||||
} else if rq.Responsive {
|
||||
r = append(r, inspector.Fail("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
|
||||
"SELECT 1 at level=strong failed", inspector.Critical))
|
||||
}
|
||||
|
||||
// Debug vars checks
|
||||
if dv := rq.DebugVars; dv != nil {
|
||||
// 1.28 Query errors
|
||||
if dv.QueryErrors == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.query_errors", "No query errors", rqliteSub, node,
|
||||
"query_errors=0", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.query_errors", "No query errors", rqliteSub, node,
|
||||
fmt.Sprintf("query_errors=%d", dv.QueryErrors), inspector.High))
|
||||
}
|
||||
|
||||
// 1.29 Execute errors
|
||||
if dv.ExecuteErrors == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.execute_errors", "No execute errors", rqliteSub, node,
|
||||
"execute_errors=0", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.execute_errors", "No execute errors", rqliteSub, node,
|
||||
fmt.Sprintf("execute_errors=%d", dv.ExecuteErrors), inspector.High))
|
||||
}
|
||||
|
||||
// 1.30 Leader not found events
|
||||
if dv.LeaderNotFound == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
|
||||
"leader_not_found=0", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
|
||||
fmt.Sprintf("leader_not_found=%d", dv.LeaderNotFound), inspector.Critical))
|
||||
}
|
||||
|
||||
// Snapshot errors
|
||||
if dv.SnapshotErrors == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
|
||||
"snapshot_errors=0", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
|
||||
fmt.Sprintf("snapshot_errors=%d", dv.SnapshotErrors), inspector.High))
|
||||
}
|
||||
|
||||
// Client retries/timeouts
|
||||
if dv.ClientRetries == 0 && dv.ClientTimeouts == 0 {
|
||||
r = append(r, inspector.Pass("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
|
||||
"retries=0 timeouts=0", inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
|
||||
fmt.Sprintf("retries=%d timeouts=%d", dv.ClientRetries, dv.ClientTimeouts), inspector.Medium))
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
// checkRQLiteCrossNode runs checks that compare RQLite state across nodes:
// leader uniqueness, term and leader agreement, applied-index convergence,
// version consistency, database-size convergence, and quorum math.
// Requires status data from at least two nodes; with fewer, a single Skip
// result is returned and no cross-node invariants are evaluated.
func checkRQLiteCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult

	// Collect only nodes that actually reported RQLite status; nodes without
	// data are excluded from every cross-node comparison below.
	type nodeInfo struct {
		host   string
		name   string
		status *inspector.RQLiteStatus
	}
	var nodes []nodeInfo
	for host, nd := range data.Nodes {
		if nd.RQLite != nil && nd.RQLite.Status != nil {
			nodes = append(nodes, nodeInfo{host: host, name: nd.Node.Name(), status: nd.RQLite.Status})
		}
	}

	if len(nodes) < 2 {
		r = append(r, inspector.Skip("rqlite.cross_node", "Cross-node checks", rqliteSub, "",
			fmt.Sprintf("only %d node(s) with RQLite data, need >=2", len(nodes)), inspector.Critical))
		return r
	}

	// 1.5 Exactly one leader: zero means no quorum/election stuck,
	// more than one means split brain.
	leaders := 0
	var leaderName string
	for _, n := range nodes {
		if n.status.RaftState == "Leader" {
			leaders++
			leaderName = n.name
		}
	}
	switch leaders {
	case 1:
		r = append(r, inspector.Pass("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("leader=%s", leaderName), inspector.Critical))
	case 0:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			"no leader found", inspector.Critical))
	default:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("found %d leaders (split brain!)", leaders), inspector.Critical))
	}

	// 1.6 Term consistency: all nodes should report the same Raft term.
	// The map groups node names by the term they report.
	terms := map[uint64][]string{}
	for _, n := range nodes {
		terms[n.status.Term] = append(terms[n.status.Term], n.name)
	}
	if len(terms) == 1 {
		// Single-entry map; the loop just extracts the only key.
		for t := range terms {
			r = append(r, inspector.Pass("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
				fmt.Sprintf("term=%d across %d nodes", t, len(nodes)), inspector.Critical))
		}
	} else {
		var parts []string
		for t, names := range terms {
			parts = append(parts, fmt.Sprintf("term=%d: %s", t, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
			"term divergence: "+strings.Join(parts, "; "), inspector.Critical))
	}

	// 1.36 All nodes agree on same leader (by reported leader node ID).
	leaderIDs := map[string][]string{}
	for _, n := range nodes {
		leaderIDs[n.status.LeaderNodeID] = append(leaderIDs[n.status.LeaderNodeID], n.name)
	}
	if len(leaderIDs) == 1 {
		for lid := range leaderIDs {
			r = append(r, inspector.Pass("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
				fmt.Sprintf("leader_id=%s", lid), inspector.Critical))
		}
	} else {
		var parts []string
		for lid, names := range leaderIDs {
			id := lid
			if id == "" {
				id = "(none)" // node reported no leader at all
			}
			parts = append(parts, fmt.Sprintf("%s: %s", id, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
			"leader disagreement: "+strings.Join(parts, "; "), inspector.Critical))
	}

	// 1.38 Applied index convergence: the min/max spread across nodes shows
	// how far followers lag the leader. Nodes reporting 0 are ignored
	// (no data rather than genuinely at index zero).
	var minApplied, maxApplied uint64
	hasApplied := false
	for _, n := range nodes {
		idx := n.status.AppliedIndex
		if idx == 0 {
			continue
		}
		if !hasApplied {
			minApplied = idx
			maxApplied = idx
			hasApplied = true
			continue
		}
		if idx < minApplied {
			minApplied = idx
		}
		if idx > maxApplied {
			maxApplied = idx
		}
	}
	if hasApplied && maxApplied > 0 {
		gap := maxApplied - minApplied
		// Thresholds: <100 healthy, <1000 lagging, otherwise severely behind.
		if gap < 100 {
			r = append(r, inspector.Pass("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d", minApplied, maxApplied, gap), inspector.Critical))
		} else if gap < 1000 {
			r = append(r, inspector.Warn("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (lagging)", minApplied, maxApplied, gap), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (severely behind)", minApplied, maxApplied, gap), inspector.Critical))
		}
	}

	// 1.35 Version consistency: a mismatch is only a Warn (e.g. mid rolling
	// upgrade). Nodes with an empty version string are excluded, so zero
	// distinct versions is possible and produces no result.
	versions := map[string][]string{}
	for _, n := range nodes {
		if n.status.Version != "" {
			versions[n.status.Version] = append(versions[n.status.Version], n.name)
		}
	}
	if len(versions) == 1 {
		for v := range versions {
			r = append(r, inspector.Pass("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
				fmt.Sprintf("version=%s", v), inspector.Medium))
		}
	} else if len(versions) > 1 {
		var parts []string
		for v, names := range versions {
			parts = append(parts, fmt.Sprintf("%s: %s", v, strings.Join(names, ",")))
		}
		r = append(r, inspector.Warn("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
			"version mismatch: "+strings.Join(parts, "; "), inspector.Medium))
	}

	// 1.40 Database size convergence: replicas should hold roughly the same
	// amount of data. Requires at least two nodes with a positive DBSize;
	// a max/min ratio above 1.05 is reported as divergence.
	type sizeEntry struct {
		name string
		size int64
	}
	var sizes []sizeEntry
	for _, n := range nodes {
		if n.status.DBSize > 0 {
			sizes = append(sizes, sizeEntry{n.name, n.status.DBSize})
		}
	}
	if len(sizes) >= 2 {
		minSize := sizes[0].size
		maxSize := sizes[0].size
		for _, s := range sizes[1:] {
			if s.size < minSize {
				minSize = s.size
			}
			if s.size > maxSize {
				maxSize = s.size
			}
		}
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 1.05 {
				r = append(r, inspector.Pass("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f", minSize, maxSize, ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f (diverged)", minSize, maxSize, ratio), inspector.High))
			}
		}
	}

	// 1.42 Quorum math: a majority of voters must be reachable.
	// NOTE(review): every voter in `nodes` is counted as reachable because
	// appearing here already implies it answered data collection, so
	// reachableVoters always equals voters and this check cannot fail from
	// this code path alone — confirm whether that is intended.
	voters := 0
	reachableVoters := 0
	for _, n := range nodes {
		if n.status.Voter {
			voters++
			reachableVoters++ // responded to SSH + curl = reachable
		}
	}
	quorumNeeded := int(math.Floor(float64(voters)/2)) + 1
	if reachableVoters >= quorumNeeded {
		r = append(r, inspector.Pass("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d", reachableVoters, voters, quorumNeeded), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d (QUORUM LOST)", reachableVoters, voters, quorumNeeded), inspector.Critical))
	}

	return r
}
|
||||
|
||||
// countRQLiteNodes counts nodes that have RQLite data.
|
||||
func countRQLiteNodes(data *inspector.ClusterData) int {
|
||||
count := 0
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.RQLite != nil {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
401
pkg/inspector/checks/rqlite_test.go
Normal file
401
pkg/inspector/checks/rqlite_test.go
Normal file
@ -0,0 +1,401 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckRQLite_Unresponsive(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{Responsive: false}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
|
||||
expectStatus(t, results, "rqlite.responsive", inspector.StatusFail)
|
||||
// Should return early — no raft_state check
|
||||
if findCheck(results, "rqlite.raft_state") != nil {
|
||||
t.Error("should not check raft_state when unresponsive")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCheckRQLite_HealthyLeader builds a fully healthy single-node fixture
// (leader state, ready /readyz, converged indexes, clean debug vars, all
// cluster peers reachable) and asserts that every per-node check passes.
func TestCheckRQLite_HealthyLeader(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.RQLite = &inspector.RQLiteData{
		Responsive: true,
		StrongRead: true,
		Readyz:     &inspector.RQLiteReadyz{Ready: true, Node: "ready", Leader: "ready", Store: "ready"},
		Status: &inspector.RQLiteStatus{
			RaftState:    "Leader",
			LeaderNodeID: "node1",
			Voter:        true,
			NumPeers:     2,
			Term:         5,
			// Commit/applied/FSM indexes all equal: no replication lag.
			CommitIndex:    1000,
			AppliedIndex:   1000,
			FsmPending:     0,
			LastLogTerm:    5,
			DBAppliedIndex: 1000,
			FsmIndex:       1000,
			LastSnapshot:   995,
			DBSizeFriendly: "1.2MB",
			Goroutines:     50,
			HeapAlloc:      100 * 1024 * 1024, // 100MB
			Version:        "8.0.0",
		},
		Nodes: map[string]*inspector.RQLiteNode{
			"node1:5001": {Addr: "node1:5001", Reachable: true, Leader: true, Voter: true},
			"node2:5001": {Addr: "node2:5001", Reachable: true, Leader: false, Voter: true},
			"node3:5001": {Addr: "node3:5001", Reachable: true, Leader: false, Voter: true},
		},
		// Zero-valued debug vars: no query/execute/snapshot errors.
		DebugVars: &inspector.RQLiteDebugVars{},
	}

	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckRQLite(data)

	expectStatus(t, results, "rqlite.responsive", inspector.StatusPass)
	expectStatus(t, results, "rqlite.readyz", inspector.StatusPass)
	expectStatus(t, results, "rqlite.raft_state", inspector.StatusPass)
	expectStatus(t, results, "rqlite.leader_known", inspector.StatusPass)
	expectStatus(t, results, "rqlite.voter", inspector.StatusPass)
	expectStatus(t, results, "rqlite.commit_applied_gap", inspector.StatusPass)
	expectStatus(t, results, "rqlite.fsm_pending", inspector.StatusPass)
	expectStatus(t, results, "rqlite.db_fsm_sync", inspector.StatusPass)
	expectStatus(t, results, "rqlite.strong_read", inspector.StatusPass)
	expectStatus(t, results, "rqlite.all_reachable", inspector.StatusPass)
	expectStatus(t, results, "rqlite.goroutines", inspector.StatusPass)
	expectStatus(t, results, "rqlite.memory", inspector.StatusPass)
	expectStatus(t, results, "rqlite.query_errors", inspector.StatusPass)
	expectStatus(t, results, "rqlite.execute_errors", inspector.StatusPass)
	expectStatus(t, results, "rqlite.leader_not_found", inspector.StatusPass)
	expectStatus(t, results, "rqlite.snapshot_errors", inspector.StatusPass)
	expectStatus(t, results, "rqlite.client_health", inspector.StatusPass)
}
|
||||
|
||||
func TestCheckRQLite_RaftStates(t *testing.T) {
|
||||
tests := []struct {
|
||||
state string
|
||||
status inspector.Status
|
||||
}{
|
||||
{"Leader", inspector.StatusPass},
|
||||
{"Follower", inspector.StatusPass},
|
||||
{"Candidate", inspector.StatusWarn},
|
||||
{"Shutdown", inspector.StatusFail},
|
||||
{"Unknown", inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.state, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Status: &inspector.RQLiteStatus{
|
||||
RaftState: tt.state,
|
||||
LeaderNodeID: "node1",
|
||||
Voter: true,
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.raft_state", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckRQLite_ReadyzFail(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Readyz: &inspector.RQLiteReadyz{Ready: false, Node: "ready", Leader: "not ready", Store: "ready"},
|
||||
Status: &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.readyz", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckRQLite_CommitAppliedGap(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
commit uint64
|
||||
applied uint64
|
||||
status inspector.Status
|
||||
}{
|
||||
{"no gap", 1000, 1000, inspector.StatusPass},
|
||||
{"small gap", 1002, 1000, inspector.StatusPass},
|
||||
{"lagging", 1050, 1000, inspector.StatusWarn},
|
||||
{"severely behind", 2000, 1000, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Status: &inspector.RQLiteStatus{
|
||||
RaftState: "Follower",
|
||||
LeaderNodeID: "n1",
|
||||
Voter: true,
|
||||
CommitIndex: tt.commit,
|
||||
AppliedIndex: tt.applied,
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.commit_applied_gap", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckRQLite_FsmPending(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pending uint64
|
||||
status inspector.Status
|
||||
}{
|
||||
{"zero", 0, inspector.StatusPass},
|
||||
{"small", 5, inspector.StatusWarn},
|
||||
{"backlog", 100, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Status: &inspector.RQLiteStatus{
|
||||
RaftState: "Follower",
|
||||
LeaderNodeID: "n1",
|
||||
Voter: true,
|
||||
FsmPending: tt.pending,
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.fsm_pending", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckRQLite_StrongReadFail(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
StrongRead: false,
|
||||
Status: &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.strong_read", inspector.StatusFail)
|
||||
}
|
||||
|
||||
// TestCheckRQLite_DebugVarsErrors sets every debug-vars error counter to a
// non-zero value and asserts the expected severity per counter: query,
// execute, and client issues only warn, while leader-not-found and snapshot
// errors fail outright.
func TestCheckRQLite_DebugVarsErrors(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Status:     &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true},
		DebugVars: &inspector.RQLiteDebugVars{
			QueryErrors:    5,
			ExecuteErrors:  3,
			LeaderNotFound: 1,
			SnapshotErrors: 2,
			ClientRetries:  10,
			ClientTimeouts: 1,
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckRQLite(data)

	expectStatus(t, results, "rqlite.query_errors", inspector.StatusWarn)
	expectStatus(t, results, "rqlite.execute_errors", inspector.StatusWarn)
	expectStatus(t, results, "rqlite.leader_not_found", inspector.StatusFail)
	expectStatus(t, results, "rqlite.snapshot_errors", inspector.StatusFail)
	expectStatus(t, results, "rqlite.client_health", inspector.StatusWarn)
}
|
||||
|
||||
func TestCheckRQLite_Goroutines(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
goroutines int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 50, inspector.StatusPass},
|
||||
{"elevated", 500, inspector.StatusWarn},
|
||||
{"high", 2000, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Status: &inspector.RQLiteStatus{
|
||||
RaftState: "Leader",
|
||||
LeaderNodeID: "n1",
|
||||
Voter: true,
|
||||
Goroutines: tt.goroutines,
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.goroutines", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- Cross-node tests ---
|
||||
|
||||
// makeRQLiteCluster builds a ClusterData fixture for cross-node tests.
// states maps host -> Raft state for each node; leaderHost is the leader
// node ID every node reports; term is the shared Raft term. All nodes get
// identical indexes, version, and DB size so the convergence checks pass
// unless a test overrides them.
func makeRQLiteCluster(leaderHost string, states map[string]string, term uint64) *inspector.ClusterData {
	nodes := map[string]*inspector.NodeData{}
	// One shared /nodes view: every member is reachable and a voter, and is
	// marked leader iff its state says so. Note all NodeData instances below
	// reference this same map.
	rqliteNodes := map[string]*inspector.RQLiteNode{}
	for host := range states {
		rqliteNodes[host+":5001"] = &inspector.RQLiteNode{
			Addr: host + ":5001", Reachable: true, Voter: true,
			Leader: states[host] == "Leader",
		}
	}

	for host, state := range states {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderHost,
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
				CommitIndex:  1000,
				Version:      "8.0.0",
				DBSize:       4096,
			},
			Nodes: rqliteNodes,
		}
		nodes[host] = nd
	}
	return makeCluster(nodes)
}
|
||||
|
||||
func TestCheckRQLite_CrossNode_SingleLeader(t *testing.T) {
|
||||
data := makeRQLiteCluster("1.1.1.1", map[string]string{
|
||||
"1.1.1.1": "Leader",
|
||||
"2.2.2.2": "Follower",
|
||||
"3.3.3.3": "Follower",
|
||||
}, 5)
|
||||
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.single_leader", inspector.StatusPass)
|
||||
expectStatus(t, results, "rqlite.term_consistent", inspector.StatusPass)
|
||||
expectStatus(t, results, "rqlite.leader_agreement", inspector.StatusPass)
|
||||
expectStatus(t, results, "rqlite.index_convergence", inspector.StatusPass)
|
||||
expectStatus(t, results, "rqlite.version_consistent", inspector.StatusPass)
|
||||
expectStatus(t, results, "rqlite.quorum", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckRQLite_CrossNode_NoLeader(t *testing.T) {
|
||||
data := makeRQLiteCluster("", map[string]string{
|
||||
"1.1.1.1": "Candidate",
|
||||
"2.2.2.2": "Candidate",
|
||||
"3.3.3.3": "Candidate",
|
||||
}, 5)
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail)
|
||||
}
|
||||
|
||||
// TestCheckRQLite_CrossNode_SplitBrain builds a cluster where two of three
// nodes each believe they are the leader (and name themselves as such),
// and asserts the single-leader check fails.
func TestCheckRQLite_CrossNode_SplitBrain(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		nd := makeNodeData(host, "node")
		state := "Follower"
		leaderID := "1.1.1.1"
		// First two hosts both claim leadership, pointing at themselves.
		if host == "1.1.1.1" || host == "2.2.2.2" {
			state = "Leader"
			leaderID = host
		}
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderID,
				Voter:        true,
				Term:         5,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckRQLite(data)
	expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail)
}
|
||||
|
||||
// TestCheckRQLite_CrossNode_TermDivergence gives one node a different Raft
// term from its peers and asserts the term-consistency check fails.
func TestCheckRQLite_CrossNode_TermDivergence(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	// Third node is one term ahead of the other two.
	terms := map[string]uint64{"1.1.1.1": 5, "2.2.2.2": 5, "3.3.3.3": 6}
	for host, term := range terms {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    "Follower",
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckRQLite(data)
	expectStatus(t, results, "rqlite.term_consistent", inspector.StatusFail)
}
|
||||
|
||||
// TestCheckRQLite_CrossNode_IndexLagging puts one follower 500 entries
// behind the leader's applied index — inside the warn band (gap >= 100 and
// < 1000) — and asserts index convergence reports a warning.
func TestCheckRQLite_CrossNode_IndexLagging(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	applied := map[string]uint64{"1.1.1.1": 1000, "2.2.2.2": 1000, "3.3.3.3": 500}
	for host, idx := range applied {
		nd := makeNodeData(host, "node")
		state := "Follower"
		if host == "1.1.1.1" {
			state = "Leader"
		}
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         5,
				AppliedIndex: idx,
				CommitIndex:  idx,
			},
		}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckRQLite(data)
	expectStatus(t, results, "rqlite.index_convergence", inspector.StatusWarn)
}
|
||||
|
||||
func TestCheckRQLite_CrossNode_SkipSingleNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.RQLite = &inspector.RQLiteData{
|
||||
Responsive: true,
|
||||
Status: &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true, Term: 5, AppliedIndex: 1000},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
expectStatus(t, results, "rqlite.cross_node", inspector.StatusSkip)
|
||||
}
|
||||
|
||||
func TestCheckRQLite_NilRQLiteData(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
// nd.RQLite is nil — no per-node checks, but cross-node skip is expected
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckRQLite(data)
|
||||
// Should only have the cross-node skip (not enough nodes)
|
||||
for _, r := range results {
|
||||
if r.Status != inspector.StatusSkip {
|
||||
t.Errorf("unexpected non-skip result: %s (status=%s)", r.ID, r.Status)
|
||||
}
|
||||
}
|
||||
}
|
||||
242
pkg/inspector/checks/system.go
Normal file
242
pkg/inspector/checks/system.go
Normal file
@ -0,0 +1,242 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
// init registers the system checker with the inspector registry so it is
// picked up automatically when this package is imported.
func init() {
	inspector.RegisterChecker("system", CheckSystem)
}

// systemSub is the subsystem label attached to all system check results.
const systemSub = "system"
|
||||
// CheckSystem runs all system-level health checks.
|
||||
func CheckSystem(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.System == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkSystemPerNode(nd)...)
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func checkSystemPerNode(nd *inspector.NodeData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
sys := nd.System
|
||||
node := nd.Node.Name()
|
||||
|
||||
// 6.1 Core services active
|
||||
coreServices := []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster"}
|
||||
for _, svc := range coreServices {
|
||||
status, ok := sys.Services[svc]
|
||||
if !ok {
|
||||
status = "unknown"
|
||||
}
|
||||
id := fmt.Sprintf("system.svc_%s", strings.ReplaceAll(svc, "-", "_"))
|
||||
name := fmt.Sprintf("%s service active", svc)
|
||||
if status == "active" {
|
||||
r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail(id, name, systemSub, node,
|
||||
fmt.Sprintf("status=%s", status), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.5 WireGuard service
|
||||
if status, ok := sys.Services["wg-quick@wg0"]; ok {
|
||||
if status == "active" {
|
||||
r = append(r, inspector.Pass("system.svc_wg", "wg-quick@wg0 active", systemSub, node, "active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.svc_wg", "wg-quick@wg0 active", systemSub, node,
|
||||
fmt.Sprintf("status=%s", status), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.3 Nameserver services (if applicable)
|
||||
if nd.Node.IsNameserver() {
|
||||
for _, svc := range []string{"coredns", "caddy"} {
|
||||
status, ok := sys.Services[svc]
|
||||
if !ok {
|
||||
status = "unknown"
|
||||
}
|
||||
id := fmt.Sprintf("system.svc_%s", svc)
|
||||
name := fmt.Sprintf("%s service active", svc)
|
||||
if status == "active" {
|
||||
r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail(id, name, systemSub, node,
|
||||
fmt.Sprintf("status=%s", status), inspector.Critical))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 6.6 Failed systemd units
|
||||
if len(sys.FailedUnits) == 0 {
|
||||
r = append(r, inspector.Pass("system.no_failed_units", "No failed systemd units", systemSub, node,
|
||||
"no failed units", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.no_failed_units", "No failed systemd units", systemSub, node,
|
||||
fmt.Sprintf("failed: %s", strings.Join(sys.FailedUnits, ", ")), inspector.High))
|
||||
}
|
||||
|
||||
// 6.14 Memory usage
|
||||
if sys.MemTotalMB > 0 {
|
||||
pct := float64(sys.MemUsedMB) / float64(sys.MemTotalMB) * 100
|
||||
if pct < 80 {
|
||||
r = append(r, inspector.Pass("system.memory", "Memory usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Medium))
|
||||
} else if pct < 90 {
|
||||
r = append(r, inspector.Warn("system.memory", "Memory usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.memory", "Memory usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%dMB/%dMB (%.0f%% CRITICAL)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.15 Disk usage
|
||||
if sys.DiskUsePct > 0 {
|
||||
if sys.DiskUsePct < 80 {
|
||||
r = append(r, inspector.Pass("system.disk", "Disk usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
|
||||
} else if sys.DiskUsePct < 90 {
|
||||
r = append(r, inspector.Warn("system.disk", "Disk usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.disk", "Disk usage healthy", systemSub, node,
|
||||
fmt.Sprintf("used=%s/%s (%d%% CRITICAL)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.17 Load average vs CPU count
|
||||
if sys.LoadAvg != "" && sys.CPUCount > 0 {
|
||||
parts := strings.Split(strings.TrimSpace(sys.LoadAvg), ",")
|
||||
if len(parts) >= 1 {
|
||||
load1, err := strconv.ParseFloat(strings.TrimSpace(parts[0]), 64)
|
||||
if err == nil {
|
||||
cpus := float64(sys.CPUCount)
|
||||
if load1 < cpus {
|
||||
r = append(r, inspector.Pass("system.load", "Load average healthy", systemSub, node,
|
||||
fmt.Sprintf("load1=%.1f cpus=%d", load1, sys.CPUCount), inspector.Medium))
|
||||
} else if load1 < cpus*2 {
|
||||
r = append(r, inspector.Warn("system.load", "Load average healthy", systemSub, node,
|
||||
fmt.Sprintf("load1=%.1f cpus=%d (elevated)", load1, sys.CPUCount), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.load", "Load average healthy", systemSub, node,
|
||||
fmt.Sprintf("load1=%.1f cpus=%d (overloaded)", load1, sys.CPUCount), inspector.High))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 6.18 OOM kills
|
||||
if sys.OOMKills == 0 {
|
||||
r = append(r, inspector.Pass("system.oom", "No OOM kills", systemSub, node,
|
||||
"no OOM kills in dmesg", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.oom", "No OOM kills", systemSub, node,
|
||||
fmt.Sprintf("%d OOM kills in dmesg", sys.OOMKills), inspector.Critical))
|
||||
}
|
||||
|
||||
// 6.19 Swap usage
|
||||
if sys.SwapTotalMB > 0 {
|
||||
pct := float64(sys.SwapUsedMB) / float64(sys.SwapTotalMB) * 100
|
||||
if pct < 30 {
|
||||
r = append(r, inspector.Pass("system.swap", "Swap usage low", systemSub, node,
|
||||
fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("system.swap", "Swap usage low", systemSub, node,
|
||||
fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.20 Uptime
|
||||
if sys.UptimeRaw != "" && sys.UptimeRaw != "unknown" {
|
||||
r = append(r, inspector.Pass("system.uptime", "System uptime reported", systemSub, node,
|
||||
fmt.Sprintf("up since %s", sys.UptimeRaw), inspector.Low))
|
||||
}
|
||||
|
||||
// 6.21 Inode usage
|
||||
if sys.InodePct > 0 {
|
||||
if sys.InodePct < 80 {
|
||||
r = append(r, inspector.Pass("system.inodes", "Inode usage healthy", systemSub, node,
|
||||
fmt.Sprintf("inode_use=%d%%", sys.InodePct), inspector.High))
|
||||
} else if sys.InodePct < 95 {
|
||||
r = append(r, inspector.Warn("system.inodes", "Inode usage healthy", systemSub, node,
|
||||
fmt.Sprintf("inode_use=%d%% (elevated)", sys.InodePct), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.inodes", "Inode usage healthy", systemSub, node,
|
||||
fmt.Sprintf("inode_use=%d%% (CRITICAL)", sys.InodePct), inspector.Critical))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.22 UFW firewall
|
||||
if sys.UFWActive {
|
||||
r = append(r, inspector.Pass("system.ufw", "UFW firewall active", systemSub, node,
|
||||
"ufw is active", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("system.ufw", "UFW firewall active", systemSub, node,
|
||||
"ufw is not active", inspector.High))
|
||||
}
|
||||
|
||||
// 6.23 Process user
|
||||
if sys.ProcessUser != "" && sys.ProcessUser != "unknown" {
|
||||
if sys.ProcessUser == "debros" {
|
||||
r = append(r, inspector.Pass("system.process_user", "debros-node runs as correct user", systemSub, node,
|
||||
"user=debros", inspector.High))
|
||||
} else if sys.ProcessUser == "root" {
|
||||
r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
|
||||
"user=root (should be debros)", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
|
||||
fmt.Sprintf("user=%s (expected debros)", sys.ProcessUser), inspector.Medium))
|
||||
}
|
||||
}
|
||||
|
||||
// 6.24 Panic/fatal in logs
|
||||
if sys.PanicCount == 0 {
|
||||
r = append(r, inspector.Pass("system.panics", "No panics in recent logs", systemSub, node,
|
||||
"0 panic/fatal in last hour", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("system.panics", "No panics in recent logs", systemSub, node,
|
||||
fmt.Sprintf("%d panic/fatal in last hour", sys.PanicCount), inspector.Critical))
|
||||
}
|
||||
|
||||
// 6.25 Expected ports listening
|
||||
expectedPorts := map[int]string{
|
||||
5001: "RQLite HTTP",
|
||||
3322: "Olric Memberlist",
|
||||
6001: "Gateway",
|
||||
4501: "IPFS API",
|
||||
}
|
||||
for port, svcName := range expectedPorts {
|
||||
found := false
|
||||
for _, p := range sys.ListeningPorts {
|
||||
if p == port {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if found {
|
||||
r = append(r, inspector.Pass(
|
||||
fmt.Sprintf("system.port_%d", port),
|
||||
fmt.Sprintf("%s port %d listening", svcName, port),
|
||||
systemSub, node, "port is bound", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn(
|
||||
fmt.Sprintf("system.port_%d", port),
|
||||
fmt.Sprintf("%s port %d listening", svcName, port),
|
||||
systemSub, node, "port is NOT bound", inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
284
pkg/inspector/checks/system_test.go
Normal file
284
pkg/inspector/checks/system_test.go
Normal file
@ -0,0 +1,284 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckSystem_HealthyNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{
|
||||
"debros-node": "active",
|
||||
"debros-olric": "active",
|
||||
"debros-ipfs": "active",
|
||||
"debros-ipfs-cluster": "active",
|
||||
"wg-quick@wg0": "active",
|
||||
},
|
||||
FailedUnits: nil,
|
||||
MemTotalMB: 8192,
|
||||
MemUsedMB: 4096,
|
||||
DiskUsePct: 50,
|
||||
DiskUsedGB: "25G",
|
||||
DiskTotalGB: "50G",
|
||||
LoadAvg: "1.0, 0.8, 0.5",
|
||||
CPUCount: 4,
|
||||
OOMKills: 0,
|
||||
SwapTotalMB: 2048,
|
||||
SwapUsedMB: 100,
|
||||
UptimeRaw: "2024-01-01 00:00:00",
|
||||
InodePct: 10,
|
||||
ListeningPorts: []int{5001, 3322, 6001, 4501},
|
||||
UFWActive: true,
|
||||
ProcessUser: "debros",
|
||||
PanicCount: 0,
|
||||
}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
|
||||
expectStatus(t, results, "system.svc_debros_node", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_debros_olric", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_debros_ipfs", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_debros_ipfs_cluster", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_wg", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.no_failed_units", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.memory", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.disk", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.load", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.oom", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.swap", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.inodes", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.ufw", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.process_user", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.panics", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_5001", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_3322", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_6001", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_4501", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckSystem_ServiceInactive(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{
|
||||
"debros-node": "active",
|
||||
"debros-olric": "inactive",
|
||||
"debros-ipfs": "active",
|
||||
"debros-ipfs-cluster": "failed",
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
|
||||
expectStatus(t, results, "system.svc_debros_node", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_debros_olric", inspector.StatusFail)
|
||||
expectStatus(t, results, "system.svc_debros_ipfs_cluster", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckSystem_NameserverServices(t *testing.T) {
|
||||
nd := makeNodeData("5.5.5.5", "nameserver-ns1")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{
|
||||
"debros-node": "active",
|
||||
"debros-olric": "active",
|
||||
"debros-ipfs": "active",
|
||||
"debros-ipfs-cluster": "active",
|
||||
"coredns": "active",
|
||||
"caddy": "active",
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.svc_coredns", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.svc_caddy", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckSystem_NameserverServicesNotCheckedOnRegularNode(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{
|
||||
"debros-node": "active",
|
||||
"debros-olric": "active",
|
||||
"debros-ipfs": "active",
|
||||
"debros-ipfs-cluster": "active",
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
if findCheck(results, "system.svc_coredns") != nil {
|
||||
t.Error("should not check coredns on regular node")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_FailedUnits(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{},
|
||||
FailedUnits: []string{"some-service.service"},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.no_failed_units", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckSystem_Memory(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
used int
|
||||
total int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 4000, 8000, inspector.StatusPass}, // 50%
|
||||
{"elevated", 7000, 8000, inspector.StatusWarn}, // 87.5%
|
||||
{"critical", 7500, 8000, inspector.StatusFail}, // 93.75%
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{},
|
||||
MemTotalMB: tt.total,
|
||||
MemUsedMB: tt.used,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.memory", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_Disk(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pct int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 60, inspector.StatusPass},
|
||||
{"elevated", 85, inspector.StatusWarn},
|
||||
{"critical", 92, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{},
|
||||
DiskUsePct: tt.pct,
|
||||
DiskUsedGB: "25G",
|
||||
DiskTotalGB: "50G",
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.disk", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_Load(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
load string
|
||||
cpus int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", "1.0, 0.8, 0.5", 4, inspector.StatusPass},
|
||||
{"elevated", "6.0, 5.0, 4.0", 4, inspector.StatusWarn},
|
||||
{"overloaded", "10.0, 9.0, 8.0", 4, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{},
|
||||
LoadAvg: tt.load,
|
||||
CPUCount: tt.cpus,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.load", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_OOMKills(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{Services: map[string]string{}, OOMKills: 3}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.oom", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckSystem_Inodes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pct int
|
||||
status inspector.Status
|
||||
}{
|
||||
{"healthy", 50, inspector.StatusPass},
|
||||
{"elevated", 82, inspector.StatusWarn},
|
||||
{"critical", 96, inspector.StatusFail},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{Services: map[string]string{}, InodePct: tt.pct}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.inodes", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_ProcessUser(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
user string
|
||||
status inspector.Status
|
||||
}{
|
||||
{"correct", "debros", inspector.StatusPass},
|
||||
{"root", "root", inspector.StatusWarn},
|
||||
{"other", "ubuntu", inspector.StatusWarn},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{Services: map[string]string{}, ProcessUser: tt.user}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.process_user", tt.status)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckSystem_Panics(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{Services: map[string]string{}, PanicCount: 5}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
expectStatus(t, results, "system.panics", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckSystem_ExpectedPorts(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.System = &inspector.SystemData{
|
||||
Services: map[string]string{},
|
||||
ListeningPorts: []int{5001, 6001}, // Missing 3322, 4501
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
|
||||
expectStatus(t, results, "system.port_5001", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_6001", inspector.StatusPass)
|
||||
expectStatus(t, results, "system.port_3322", inspector.StatusWarn)
|
||||
expectStatus(t, results, "system.port_4501", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckSystem_NilData(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckSystem(data)
|
||||
if len(results) != 0 {
|
||||
t.Errorf("expected 0 results for nil System data, got %d", len(results))
|
||||
}
|
||||
}
|
||||
270
pkg/inspector/checks/wireguard.go
Normal file
270
pkg/inspector/checks/wireguard.go
Normal file
@ -0,0 +1,270 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func init() {
|
||||
inspector.RegisterChecker("wireguard", CheckWireGuard)
|
||||
}
|
||||
|
||||
const wgSub = "wireguard"
|
||||
|
||||
// CheckWireGuard runs all WireGuard health checks.
|
||||
func CheckWireGuard(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var results []inspector.CheckResult
|
||||
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.WireGuard == nil {
|
||||
continue
|
||||
}
|
||||
results = append(results, checkWGPerNode(nd, data)...)
|
||||
}
|
||||
|
||||
results = append(results, checkWGCrossNode(data)...)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func checkWGPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
wg := nd.WireGuard
|
||||
node := nd.Node.Name()
|
||||
|
||||
// 5.1 Interface up
|
||||
if wg.InterfaceUp {
|
||||
r = append(r, inspector.Pass("wg.interface_up", "WireGuard interface up", wgSub, node,
|
||||
fmt.Sprintf("wg0 up, IP=%s", wg.WgIP), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("wg.interface_up", "WireGuard interface up", wgSub, node,
|
||||
"wg0 interface is DOWN", inspector.Critical))
|
||||
return r
|
||||
}
|
||||
|
||||
// 5.2 Service active
|
||||
if wg.ServiceActive {
|
||||
r = append(r, inspector.Pass("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
|
||||
"service is active", inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
|
||||
"service not active (interface up but service not managed by systemd?)", inspector.High))
|
||||
}
|
||||
|
||||
// 5.5 Correct IP in 10.0.0.0/24
|
||||
if wg.WgIP != "" && strings.HasPrefix(wg.WgIP, "10.0.0.") {
|
||||
r = append(r, inspector.Pass("wg.correct_ip", "WG IP in expected range", wgSub, node,
|
||||
fmt.Sprintf("IP=%s (10.0.0.0/24)", wg.WgIP), inspector.Critical))
|
||||
} else if wg.WgIP != "" {
|
||||
r = append(r, inspector.Warn("wg.correct_ip", "WG IP in expected range", wgSub, node,
|
||||
fmt.Sprintf("IP=%s (not in 10.0.0.0/24)", wg.WgIP), inspector.High))
|
||||
}
|
||||
|
||||
// 5.4 Listen port
|
||||
if wg.ListenPort == 51820 {
|
||||
r = append(r, inspector.Pass("wg.listen_port", "Listen port is 51820", wgSub, node,
|
||||
"port=51820", inspector.Critical))
|
||||
} else if wg.ListenPort > 0 {
|
||||
r = append(r, inspector.Warn("wg.listen_port", "Listen port is 51820", wgSub, node,
|
||||
fmt.Sprintf("port=%d (expected 51820)", wg.ListenPort), inspector.High))
|
||||
}
|
||||
|
||||
// 5.7 Peer count
|
||||
expectedNodes := countWGNodes(data)
|
||||
expectedPeers := expectedNodes - 1
|
||||
if expectedPeers < 0 {
|
||||
expectedPeers = 0
|
||||
}
|
||||
if wg.PeerCount >= expectedPeers {
|
||||
r = append(r, inspector.Pass("wg.peer_count", "Peer count matches expected", wgSub, node,
|
||||
fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
|
||||
} else if wg.PeerCount > 0 {
|
||||
r = append(r, inspector.Warn("wg.peer_count", "Peer count matches expected", wgSub, node,
|
||||
fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("wg.peer_count", "Peer count matches expected", wgSub, node,
|
||||
fmt.Sprintf("peers=%d (isolated!)", wg.PeerCount), inspector.Critical))
|
||||
}
|
||||
|
||||
// 5.29 MTU
|
||||
if wg.MTU == 1420 {
|
||||
r = append(r, inspector.Pass("wg.mtu", "MTU is 1420", wgSub, node,
|
||||
"MTU=1420", inspector.High))
|
||||
} else if wg.MTU > 0 {
|
||||
r = append(r, inspector.Warn("wg.mtu", "MTU is 1420", wgSub, node,
|
||||
fmt.Sprintf("MTU=%d (expected 1420)", wg.MTU), inspector.High))
|
||||
}
|
||||
|
||||
// 5.35 Config file exists
|
||||
if wg.ConfigExists {
|
||||
r = append(r, inspector.Pass("wg.config_exists", "Config file exists", wgSub, node,
|
||||
"/etc/wireguard/wg0.conf present", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("wg.config_exists", "Config file exists", wgSub, node,
|
||||
"/etc/wireguard/wg0.conf NOT found", inspector.High))
|
||||
}
|
||||
|
||||
// 5.36 Config permissions
|
||||
if wg.ConfigPerms == "600" {
|
||||
r = append(r, inspector.Pass("wg.config_perms", "Config file permissions 600", wgSub, node,
|
||||
"perms=600", inspector.Critical))
|
||||
} else if wg.ConfigPerms != "" && wg.ConfigPerms != "000" {
|
||||
r = append(r, inspector.Warn("wg.config_perms", "Config file permissions 600", wgSub, node,
|
||||
fmt.Sprintf("perms=%s (expected 600)", wg.ConfigPerms), inspector.Critical))
|
||||
}
|
||||
|
||||
// Per-peer checks
|
||||
now := time.Now().Unix()
|
||||
neverHandshaked := 0
|
||||
staleHandshakes := 0
|
||||
noTraffic := 0
|
||||
|
||||
for _, peer := range wg.Peers {
|
||||
// 5.20 Each peer has exactly one /32 allowed IP
|
||||
if !strings.Contains(peer.AllowedIPs, "/32") {
|
||||
r = append(r, inspector.Warn("wg.peer_allowed_ip", "Peer has /32 allowed IP", wgSub, node,
|
||||
fmt.Sprintf("peer %s...%s has allowed_ips=%s", peer.PublicKey[:8], peer.PublicKey[len(peer.PublicKey)-4:], peer.AllowedIPs), inspector.High))
|
||||
}
|
||||
|
||||
// 5.23 No peer has 0.0.0.0/0
|
||||
if strings.Contains(peer.AllowedIPs, "0.0.0.0/0") {
|
||||
r = append(r, inspector.Fail("wg.peer_catch_all", "No catch-all route peer", wgSub, node,
|
||||
fmt.Sprintf("peer %s...%s has 0.0.0.0/0 (route hijack!)", peer.PublicKey[:8], peer.PublicKey[len(peer.PublicKey)-4:]), inspector.Critical))
|
||||
}
|
||||
|
||||
// 5.11-5.12 Handshake freshness
|
||||
if peer.LatestHandshake == 0 {
|
||||
neverHandshaked++
|
||||
} else {
|
||||
age := now - peer.LatestHandshake
|
||||
if age > 300 {
|
||||
staleHandshakes++
|
||||
}
|
||||
}
|
||||
|
||||
// 5.13 Transfer stats
|
||||
if peer.TransferRx == 0 && peer.TransferTx == 0 {
|
||||
noTraffic++
|
||||
}
|
||||
}
|
||||
|
||||
if len(wg.Peers) > 0 {
|
||||
// 5.12 Never handshaked
|
||||
if neverHandshaked == 0 {
|
||||
r = append(r, inspector.Pass("wg.handshake_all", "All peers have handshaked", wgSub, node,
|
||||
fmt.Sprintf("%d/%d peers handshaked", len(wg.Peers), len(wg.Peers)), inspector.Critical))
|
||||
} else {
|
||||
r = append(r, inspector.Fail("wg.handshake_all", "All peers have handshaked", wgSub, node,
|
||||
fmt.Sprintf("%d/%d peers never handshaked", neverHandshaked, len(wg.Peers)), inspector.Critical))
|
||||
}
|
||||
|
||||
// 5.11 Stale handshakes
|
||||
if staleHandshakes == 0 {
|
||||
r = append(r, inspector.Pass("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
|
||||
"all handshakes within 5 minutes", inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
|
||||
fmt.Sprintf("%d/%d peers with stale handshake (>5m)", staleHandshakes, len(wg.Peers)), inspector.High))
|
||||
}
|
||||
|
||||
// 5.13 Transfer
|
||||
if noTraffic == 0 {
|
||||
r = append(r, inspector.Pass("wg.peer_traffic", "All peers have traffic", wgSub, node,
|
||||
fmt.Sprintf("%d/%d peers with traffic", len(wg.Peers), len(wg.Peers)), inspector.High))
|
||||
} else {
|
||||
r = append(r, inspector.Warn("wg.peer_traffic", "All peers have traffic", wgSub, node,
|
||||
fmt.Sprintf("%d/%d peers with zero traffic", noTraffic, len(wg.Peers)), inspector.High))
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func checkWGCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
||||
var r []inspector.CheckResult
|
||||
|
||||
type nodeInfo struct {
|
||||
name string
|
||||
wg *inspector.WireGuardData
|
||||
}
|
||||
var nodes []nodeInfo
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.WireGuard != nil && nd.WireGuard.InterfaceUp {
|
||||
nodes = append(nodes, nodeInfo{name: nd.Node.Name(), wg: nd.WireGuard})
|
||||
}
|
||||
}
|
||||
|
||||
if len(nodes) < 2 {
|
||||
return r
|
||||
}
|
||||
|
||||
// 5.8 Peer count consistent
|
||||
counts := map[int]int{}
|
||||
for _, n := range nodes {
|
||||
counts[n.wg.PeerCount]++
|
||||
}
|
||||
if len(counts) == 1 {
|
||||
for c := range counts {
|
||||
r = append(r, inspector.Pass("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
|
||||
fmt.Sprintf("all nodes have %d peers", c), inspector.High))
|
||||
}
|
||||
} else {
|
||||
var parts []string
|
||||
for c, num := range counts {
|
||||
parts = append(parts, fmt.Sprintf("%d nodes have %d peers", num, c))
|
||||
}
|
||||
r = append(r, inspector.Warn("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
|
||||
strings.Join(parts, "; "), inspector.High))
|
||||
}
|
||||
|
||||
// 5.30 MTU consistent
|
||||
mtus := map[int]int{}
|
||||
for _, n := range nodes {
|
||||
if n.wg.MTU > 0 {
|
||||
mtus[n.wg.MTU]++
|
||||
}
|
||||
}
|
||||
if len(mtus) == 1 {
|
||||
for m := range mtus {
|
||||
r = append(r, inspector.Pass("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
|
||||
fmt.Sprintf("all nodes MTU=%d", m), inspector.High))
|
||||
}
|
||||
} else if len(mtus) > 1 {
|
||||
r = append(r, inspector.Warn("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
|
||||
fmt.Sprintf("%d different MTU values", len(mtus)), inspector.High))
|
||||
}
|
||||
|
||||
// 5.50 Public key uniqueness
|
||||
allKeys := map[string][]string{}
|
||||
for _, n := range nodes {
|
||||
for _, peer := range n.wg.Peers {
|
||||
allKeys[peer.PublicKey] = append(allKeys[peer.PublicKey], n.name)
|
||||
}
|
||||
}
|
||||
dupeKeys := 0
|
||||
for _, names := range allKeys {
|
||||
if len(names) > len(nodes)-1 {
|
||||
dupeKeys++
|
||||
}
|
||||
}
|
||||
// If all good, the same key should appear at most N-1 times (once per other node)
|
||||
if dupeKeys == 0 {
|
||||
r = append(r, inspector.Pass("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "",
|
||||
fmt.Sprintf("%d unique peer keys", len(allKeys)), inspector.Critical))
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func countWGNodes(data *inspector.ClusterData) int {
|
||||
count := 0
|
||||
for _, nd := range data.Nodes {
|
||||
if nd.WireGuard != nil {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
230
pkg/inspector/checks/wireguard_test.go
Normal file
230
pkg/inspector/checks/wireguard_test.go
Normal file
@ -0,0 +1,230 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/inspector"
|
||||
)
|
||||
|
||||
func TestCheckWireGuard_InterfaceDown(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{InterfaceUp: false}
|
||||
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
|
||||
expectStatus(t, results, "wg.interface_up", inspector.StatusFail)
|
||||
// Early return — no further per-node checks
|
||||
if findCheck(results, "wg.service_active") != nil {
|
||||
t.Error("should not check service_active when interface down")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_HealthyNode(t *testing.T) {
|
||||
now := time.Now().Unix()
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
ServiceActive: true,
|
||||
WgIP: "10.0.0.1",
|
||||
ListenPort: 51820,
|
||||
PeerCount: 2,
|
||||
MTU: 1420,
|
||||
ConfigExists: true,
|
||||
ConfigPerms: "600",
|
||||
Peers: []inspector.WGPeer{
|
||||
{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: now - 30, TransferRx: 1000, TransferTx: 2000},
|
||||
{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: now - 60, TransferRx: 500, TransferTx: 800},
|
||||
},
|
||||
}
|
||||
|
||||
// Single-node for per-node assertions (avoids helper node interference)
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
|
||||
expectStatus(t, results, "wg.interface_up", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.service_active", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.correct_ip", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.listen_port", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.mtu", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.config_exists", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.config_perms", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.handshake_all", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.handshake_fresh", inspector.StatusPass)
|
||||
expectStatus(t, results, "wg.peer_traffic", inspector.StatusPass)
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_WrongIP(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
WgIP: "192.168.1.5",
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
expectStatus(t, results, "wg.correct_ip", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_WrongPort(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
WgIP: "10.0.0.1",
|
||||
ListenPort: 12345,
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
expectStatus(t, results, "wg.listen_port", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_PeerCountMismatch(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 1}
|
||||
|
||||
nodes := map[string]*inspector.NodeData{"1.1.1.1": nd}
|
||||
for _, host := range []string{"2.2.2.2", "3.3.3.3", "4.4.4.4"} {
|
||||
other := makeNodeData(host, "node")
|
||||
other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 3}
|
||||
nodes[host] = other
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckWireGuard(data)
|
||||
|
||||
// Node 1.1.1.1 has 1 peer but expects 3 (4 nodes - 1)
|
||||
c := findCheck(results, "wg.peer_count")
|
||||
if c == nil {
|
||||
t.Fatal("expected wg.peer_count check")
|
||||
}
|
||||
// At least one node should have a warn
|
||||
hasWarn := false
|
||||
for _, r := range results {
|
||||
if r.ID == "wg.peer_count" && r.Status == inspector.StatusWarn {
|
||||
hasWarn = true
|
||||
}
|
||||
}
|
||||
if !hasWarn {
|
||||
t.Error("expected at least one wg.peer_count warn for mismatched peer count")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_ZeroPeers(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 0}
|
||||
|
||||
nodes := map[string]*inspector.NodeData{"1.1.1.1": nd}
|
||||
for _, host := range []string{"2.2.2.2", "3.3.3.3"} {
|
||||
other := makeNodeData(host, "node")
|
||||
other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2}
|
||||
nodes[host] = other
|
||||
}
|
||||
data := makeCluster(nodes)
|
||||
results := CheckWireGuard(data)
|
||||
|
||||
// At least one node should fail (zero peers = isolated)
|
||||
hasFail := false
|
||||
for _, r := range results {
|
||||
if r.ID == "wg.peer_count" && r.Status == inspector.StatusFail {
|
||||
hasFail = true
|
||||
}
|
||||
}
|
||||
if !hasFail {
|
||||
t.Error("expected wg.peer_count fail for isolated node")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_StaleHandshakes(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
WgIP: "10.0.0.1",
|
||||
PeerCount: 2,
|
||||
Peers: []inspector.WGPeer{
|
||||
{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200},
|
||||
{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200},
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
expectStatus(t, results, "wg.handshake_fresh", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_NeverHandshaked(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
WgIP: "10.0.0.1",
|
||||
PeerCount: 1,
|
||||
Peers: []inspector.WGPeer{
|
||||
{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: 0},
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
expectStatus(t, results, "wg.handshake_all", inspector.StatusFail)
|
||||
}
|
||||
|
||||
func TestCheckWireGuard_NoTraffic(t *testing.T) {
|
||||
nd := makeNodeData("1.1.1.1", "node")
|
||||
nd.WireGuard = &inspector.WireGuardData{
|
||||
InterfaceUp: true,
|
||||
WgIP: "10.0.0.1",
|
||||
PeerCount: 1,
|
||||
Peers: []inspector.WGPeer{
|
||||
{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix(), TransferRx: 0, TransferTx: 0},
|
||||
},
|
||||
}
|
||||
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
|
||||
results := CheckWireGuard(data)
|
||||
expectStatus(t, results, "wg.peer_traffic", inspector.StatusWarn)
|
||||
}
|
||||
|
||||
// TestCheckWireGuard_CatchAllRoute verifies that a peer advertising the
// catch-all route 0.0.0.0/0 fails wg.peer_catch_all (it would hijack all
// traffic through the tunnel).
func TestCheckWireGuard_CatchAllRoute(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "0.0.0.0/0", LatestHandshake: time.Now().Unix(), TransferRx: 100, TransferTx: 200},
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.peer_catch_all", inspector.StatusFail)
}
|
||||
|
||||
// TestCheckWireGuard_CrossNode_PeerCountConsistent verifies that three
// nodes with identical peer counts and MTUs pass both cross-node checks.
func TestCheckWireGuard_CrossNode_PeerCountConsistent(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		nd := makeNodeData(host, "node")
		nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2, MTU: 1420}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusPass)
	expectStatus(t, results, "wg.mtu_consistent", inspector.StatusPass)
}
|
||||
|
||||
// TestCheckWireGuard_CrossNode_PeerCountInconsistent verifies that a node
// with a divergent peer count (1 vs 2) triggers a consistency warning.
func TestCheckWireGuard_CrossNode_PeerCountInconsistent(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	counts := []int{2, 2, 1}
	for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		nd := makeNodeData(host, "node")
		nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: counts[i], MTU: 1420}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusWarn)
}
|
||||
|
||||
// TestCheckWireGuard_NilData verifies that nodes with no WireGuard data
// (nd.WireGuard left nil) produce no check results at all.
func TestCheckWireGuard_NilData(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	if len(results) != 0 {
		t.Errorf("expected 0 results for nil WireGuard data, got %d", len(results))
	}
}
|
||||
1268
pkg/inspector/collector.go
Normal file
1268
pkg/inspector/collector.go
Normal file
File diff suppressed because it is too large
Load Diff
118
pkg/inspector/config.go
Normal file
118
pkg/inspector/config.go
Normal file
@ -0,0 +1,118 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Node represents a remote node parsed from remote-nodes.conf.
// One Node corresponds to one pipe-delimited line of the config file.
type Node struct {
	Environment string // devnet, testnet
	User        string // SSH user
	Host        string // IP or hostname
	Password    string // SSH password, used when SSHKey is empty
	Role        string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3
	SSHKey      string // optional path to SSH key; when set, key auth replaces password auth
}
|
||||
|
||||
// Name returns a short display name for the node (user@host).
|
||||
func (n Node) Name() string {
|
||||
return fmt.Sprintf("%s@%s", n.User, n.Host)
|
||||
}
|
||||
|
||||
// IsNameserver returns true if the node has a nameserver role.
|
||||
func (n Node) IsNameserver() bool {
|
||||
return strings.HasPrefix(n.Role, "nameserver")
|
||||
}
|
||||
|
||||
// LoadNodes parses a remote-nodes.conf file into a slice of Nodes.
|
||||
// Format: environment|user@host|password|role|ssh_key (ssh_key optional)
|
||||
func LoadNodes(path string) ([]Node, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open config: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var nodes []Node
|
||||
scanner := bufio.NewScanner(f)
|
||||
lineNum := 0
|
||||
for scanner.Scan() {
|
||||
lineNum++
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
|
||||
parts := strings.SplitN(line, "|", 5)
|
||||
if len(parts) < 4 {
|
||||
return nil, fmt.Errorf("line %d: expected at least 4 pipe-delimited fields, got %d", lineNum, len(parts))
|
||||
}
|
||||
|
||||
env := parts[0]
|
||||
userHost := parts[1]
|
||||
password := parts[2]
|
||||
role := parts[3]
|
||||
|
||||
var sshKey string
|
||||
if len(parts) == 5 {
|
||||
sshKey = parts[4]
|
||||
}
|
||||
|
||||
// Parse user@host
|
||||
at := strings.LastIndex(userHost, "@")
|
||||
if at < 0 {
|
||||
return nil, fmt.Errorf("line %d: expected user@host format, got %q", lineNum, userHost)
|
||||
}
|
||||
user := userHost[:at]
|
||||
host := userHost[at+1:]
|
||||
|
||||
nodes = append(nodes, Node{
|
||||
Environment: env,
|
||||
User: user,
|
||||
Host: host,
|
||||
Password: password,
|
||||
Role: role,
|
||||
SSHKey: sshKey,
|
||||
})
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("reading config: %w", err)
|
||||
}
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
// FilterByEnv returns only nodes matching the given environment.
|
||||
func FilterByEnv(nodes []Node, env string) []Node {
|
||||
var filtered []Node
|
||||
for _, n := range nodes {
|
||||
if n.Environment == env {
|
||||
filtered = append(filtered, n)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// FilterByRole returns only nodes matching the given role prefix.
|
||||
func FilterByRole(nodes []Node, rolePrefix string) []Node {
|
||||
var filtered []Node
|
||||
for _, n := range nodes {
|
||||
if strings.HasPrefix(n.Role, rolePrefix) {
|
||||
filtered = append(filtered, n)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// RegularNodes returns non-nameserver nodes.
|
||||
func RegularNodes(nodes []Node) []Node {
|
||||
var filtered []Node
|
||||
for _, n := range nodes {
|
||||
if n.Role == "node" {
|
||||
filtered = append(filtered, n)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
179
pkg/inspector/config_test.go
Normal file
179
pkg/inspector/config_test.go
Normal file
@ -0,0 +1,179 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLoadNodes exercises the happy path: comment lines are skipped, the
// four mandatory fields parse correctly, and the optional fifth field
// (ssh_key) is captured when present.
func TestLoadNodes(t *testing.T) {
	content := `# Comment line
devnet|ubuntu@1.2.3.4|pass123|node
devnet|ubuntu@1.2.3.5|pass456|node
devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key
`
	path := writeTempFile(t, content)

	nodes, err := LoadNodes(path)
	if err != nil {
		t.Fatalf("LoadNodes: %v", err)
	}
	if len(nodes) != 3 {
		t.Fatalf("want 3 nodes, got %d", len(nodes))
	}

	// First node: no SSH key, every field taken from its config line.
	n := nodes[0]
	if n.Environment != "devnet" {
		t.Errorf("node[0].Environment = %q, want devnet", n.Environment)
	}
	if n.User != "ubuntu" {
		t.Errorf("node[0].User = %q, want ubuntu", n.User)
	}
	if n.Host != "1.2.3.4" {
		t.Errorf("node[0].Host = %q, want 1.2.3.4", n.Host)
	}
	if n.Password != "pass123" {
		t.Errorf("node[0].Password = %q, want pass123", n.Password)
	}
	if n.Role != "node" {
		t.Errorf("node[0].Role = %q, want node", n.Role)
	}
	if n.SSHKey != "" {
		t.Errorf("node[0].SSHKey = %q, want empty", n.SSHKey)
	}

	// Third node carries the optional SSH key field.
	n3 := nodes[2]
	if n3.Role != "nameserver-ns1" {
		t.Errorf("node[2].Role = %q, want nameserver-ns1", n3.Role)
	}
	if n3.SSHKey != "/path/to/key" {
		t.Errorf("node[2].SSHKey = %q, want /path/to/key", n3.SSHKey)
	}
}
|
||||
|
||||
// TestLoadNodes_EmptyLines verifies that blank lines and '#' comment
// lines are skipped rather than treated as malformed entries.
func TestLoadNodes_EmptyLines(t *testing.T) {
	content := `
# Full line comment

devnet|ubuntu@1.2.3.4|pass|node

# Another comment
devnet|ubuntu@1.2.3.5|pass|node
`
	path := writeTempFile(t, content)

	nodes, err := LoadNodes(path)
	if err != nil {
		t.Fatalf("LoadNodes: %v", err)
	}
	if len(nodes) != 2 {
		t.Fatalf("want 2 nodes (blank/comment lines skipped), got %d", len(nodes))
	}
}
|
||||
|
||||
// TestLoadNodes_InvalidFormat verifies that malformed lines (too few
// fields, or a user@host field without '@') produce a parse error.
func TestLoadNodes_InvalidFormat(t *testing.T) {
	tests := []struct {
		name    string
		content string
	}{
		{"too few fields", "devnet|ubuntu@1.2.3.4|pass\n"},
		{"no @ in userhost", "devnet|localhost|pass|node\n"},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			path := writeTempFile(t, tt.content)
			_, err := LoadNodes(path)
			if err == nil {
				t.Error("expected error for invalid format")
			}
		})
	}
}
|
||||
|
||||
// TestLoadNodes_FileNotFound verifies the open error is propagated when
// the config path does not exist.
func TestLoadNodes_FileNotFound(t *testing.T) {
	_, err := LoadNodes("/nonexistent/path/file.conf")
	if err == nil {
		t.Error("expected error for nonexistent file")
	}
}
|
||||
|
||||
// TestFilterByEnv verifies that only nodes matching the requested
// environment are returned.
func TestFilterByEnv(t *testing.T) {
	nodes := []Node{
		{Environment: "devnet", Host: "1.1.1.1"},
		{Environment: "testnet", Host: "2.2.2.2"},
		{Environment: "devnet", Host: "3.3.3.3"},
	}
	filtered := FilterByEnv(nodes, "devnet")
	if len(filtered) != 2 {
		t.Fatalf("want 2 devnet nodes, got %d", len(filtered))
	}
	for _, n := range filtered {
		if n.Environment != "devnet" {
			t.Errorf("got env=%s, want devnet", n.Environment)
		}
	}
}
|
||||
|
||||
// TestFilterByRole verifies prefix matching: "nameserver" matches both
// nameserver-ns1 and nameserver-ns2 but not plain "node".
func TestFilterByRole(t *testing.T) {
	nodes := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "nameserver-ns2", Host: "3.3.3.3"},
		{Role: "node", Host: "4.4.4.4"},
	}
	filtered := FilterByRole(nodes, "nameserver")
	if len(filtered) != 2 {
		t.Fatalf("want 2 nameserver nodes, got %d", len(filtered))
	}
}
|
||||
|
||||
// TestRegularNodes verifies that only role == "node" entries survive;
// nameserver entries are dropped.
func TestRegularNodes(t *testing.T) {
	nodes := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "node", Host: "3.3.3.3"},
	}
	regular := RegularNodes(nodes)
	if len(regular) != 2 {
		t.Fatalf("want 2 regular nodes, got %d", len(regular))
	}
}
|
||||
|
||||
// TestNode_Name verifies the user@host display-name format.
func TestNode_Name(t *testing.T) {
	n := Node{User: "ubuntu", Host: "1.2.3.4"}
	if got := n.Name(); got != "ubuntu@1.2.3.4" {
		t.Errorf("Name() = %q, want ubuntu@1.2.3.4", got)
	}
}
|
||||
|
||||
// TestNode_IsNameserver verifies the role-prefix classification,
// including the empty-role edge case.
func TestNode_IsNameserver(t *testing.T) {
	tests := []struct {
		role string
		want bool
	}{
		{"nameserver-ns1", true},
		{"nameserver-ns2", true},
		{"node", false},
		{"", false},
	}
	for _, tt := range tests {
		t.Run(tt.role, func(t *testing.T) {
			n := Node{Role: tt.role}
			if got := n.IsNameserver(); got != tt.want {
				t.Errorf("IsNameserver(%q) = %v, want %v", tt.role, got, tt.want)
			}
		})
	}
}
|
||||
|
||||
// writeTempFile writes content to a fresh file under t.TempDir and
// returns its path. The testing framework removes the directory (and
// file) automatically when the test finishes.
func writeTempFile(t *testing.T, content string) string {
	t.Helper()
	dir := t.TempDir()
	path := filepath.Join(dir, "test-nodes.conf")
	if err := os.WriteFile(path, []byte(content), 0644); err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return path
}
|
||||
136
pkg/inspector/report.go
Normal file
136
pkg/inspector/report.go
Normal file
@ -0,0 +1,136 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// PrintTable writes a human-readable table of check results to w:
// results sorted failures-first, grouped by subsystem in first-seen
// order, followed by a one-line summary with the run duration.
func PrintTable(results *Results, w io.Writer) {
	if len(results.Checks) == 0 {
		fmt.Fprintf(w, "No checks executed.\n")
		return
	}

	// Sort: failures first, then warnings, then passes, then skips.
	// Within each group, sort by severity (critical first).
	// Work on a copy so the caller's slice order is untouched.
	sorted := make([]CheckResult, len(results.Checks))
	copy(sorted, results.Checks)
	sort.Slice(sorted, func(i, j int) bool {
		oi, oj := statusOrder(sorted[i].Status), statusOrder(sorted[j].Status)
		if oi != oj {
			return oi < oj
		}
		// Higher severity first
		if sorted[i].Severity != sorted[j].Severity {
			return sorted[i].Severity > sorted[j].Severity
		}
		// Stable final tiebreak by ID for deterministic output.
		return sorted[i].ID < sorted[j].ID
	})

	// Group by subsystem, recording first-seen order so groups appear
	// in the same order their first (worst) check sorted to.
	groups := map[string][]CheckResult{}
	var subsystems []string
	for _, c := range sorted {
		if _, exists := groups[c.Subsystem]; !exists {
			subsystems = append(subsystems, c.Subsystem)
		}
		groups[c.Subsystem] = append(groups[c.Subsystem], c)
	}

	for _, sub := range subsystems {
		checks := groups[sub]
		// NOTE(review): the header always uses severityIcon(Critical)
		// regardless of the group's actual severities — confirm intended.
		fmt.Fprintf(w, "\n%s %s\n", severityIcon(Critical), strings.ToUpper(sub))
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))

		for _, c := range checks {
			icon := statusIcon(c.Status)
			sev := fmt.Sprintf("[%s]", c.Severity)
			nodePart := ""
			if c.Node != "" {
				nodePart = fmt.Sprintf(" (%s)", c.Node)
			}
			fmt.Fprintf(w, " %s %-8s %s%s\n", icon, sev, c.Name, nodePart)
			if c.Message != "" {
				fmt.Fprintf(w, " %s\n", c.Message)
			}
		}
	}

	passed, failed, warned, skipped := results.Summary()
	fmt.Fprintf(w, "\n%s\n", strings.Repeat("=", 70))
	fmt.Fprintf(w, "Summary: %d passed, %d failed, %d warnings, %d skipped (%.1fs)\n",
		passed, failed, warned, skipped, results.Duration.Seconds())
}
|
||||
|
||||
// PrintJSON writes check results as JSON.
|
||||
func PrintJSON(results *Results, w io.Writer) {
|
||||
passed, failed, warned, skipped := results.Summary()
|
||||
output := struct {
|
||||
Summary struct {
|
||||
Passed int `json:"passed"`
|
||||
Failed int `json:"failed"`
|
||||
Warned int `json:"warned"`
|
||||
Skipped int `json:"skipped"`
|
||||
Total int `json:"total"`
|
||||
Seconds float64 `json:"duration_seconds"`
|
||||
} `json:"summary"`
|
||||
Checks []CheckResult `json:"checks"`
|
||||
}{
|
||||
Checks: results.Checks,
|
||||
}
|
||||
output.Summary.Passed = passed
|
||||
output.Summary.Failed = failed
|
||||
output.Summary.Warned = warned
|
||||
output.Summary.Skipped = skipped
|
||||
output.Summary.Total = len(results.Checks)
|
||||
output.Summary.Seconds = results.Duration.Seconds()
|
||||
|
||||
enc := json.NewEncoder(w)
|
||||
enc.SetIndent("", " ")
|
||||
enc.Encode(output)
|
||||
}
|
||||
|
||||
// SummaryLine returns a one-line summary string.
|
||||
func SummaryLine(results *Results) string {
|
||||
passed, failed, warned, skipped := results.Summary()
|
||||
return fmt.Sprintf("%d passed, %d failed, %d warnings, %d skipped",
|
||||
passed, failed, warned, skipped)
|
||||
}
|
||||
|
||||
func statusOrder(s Status) int {
|
||||
switch s {
|
||||
case StatusFail:
|
||||
return 0
|
||||
case StatusWarn:
|
||||
return 1
|
||||
case StatusPass:
|
||||
return 2
|
||||
case StatusSkip:
|
||||
return 3
|
||||
default:
|
||||
return 4
|
||||
}
|
||||
}
|
||||
|
||||
func statusIcon(s Status) string {
|
||||
switch s {
|
||||
case StatusPass:
|
||||
return "OK"
|
||||
case StatusFail:
|
||||
return "FAIL"
|
||||
case StatusWarn:
|
||||
return "WARN"
|
||||
case StatusSkip:
|
||||
return "SKIP"
|
||||
default:
|
||||
return "??"
|
||||
}
|
||||
}
|
||||
|
||||
// severityIcon returns the marker used for subsystem headers.
// The severity argument is currently unused: every header gets the same
// "##" marker. Kept as a hook for severity-aware icons later.
func severityIcon(_ Severity) string {
	return "##"
}
|
||||
135
pkg/inspector/report_test.go
Normal file
135
pkg/inspector/report_test.go
Normal file
@ -0,0 +1,135 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestPrintTable_EmptyResults verifies the "No checks executed" message
// is printed when the result set is empty.
func TestPrintTable_EmptyResults(t *testing.T) {
	r := &Results{}
	var buf bytes.Buffer
	PrintTable(r, &buf)
	if !strings.Contains(buf.String(), "No checks executed") {
		t.Errorf("expected 'No checks executed', got %q", buf.String())
	}
}
|
||||
|
||||
// TestPrintTable_SortsFailuresFirst verifies output ordering: FAIL lines
// come before WARN lines, which come before OK lines.
func TestPrintTable_SortsFailuresFirst(t *testing.T) {
	r := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "Pass check", Subsystem: "test", Status: StatusPass, Severity: Low},
			{ID: "b", Name: "Fail check", Subsystem: "test", Status: StatusFail, Severity: Critical},
			{ID: "c", Name: "Warn check", Subsystem: "test", Status: StatusWarn, Severity: High},
		},
	}
	var buf bytes.Buffer
	PrintTable(r, &buf)
	output := buf.String()

	// FAIL should appear before WARN, which should appear before OK
	failIdx := strings.Index(output, "FAIL")
	warnIdx := strings.Index(output, "WARN")
	okIdx := strings.Index(output, "OK")

	if failIdx < 0 || warnIdx < 0 || okIdx < 0 {
		t.Fatalf("expected FAIL, WARN, and OK in output:\n%s", output)
	}
	if failIdx > warnIdx {
		t.Errorf("FAIL (pos %d) should appear before WARN (pos %d)", failIdx, warnIdx)
	}
	if warnIdx > okIdx {
		t.Errorf("WARN (pos %d) should appear before OK (pos %d)", warnIdx, okIdx)
	}
}
|
||||
|
||||
// TestPrintTable_IncludesNode verifies a check's node name (user@host)
// is rendered alongside the check line.
func TestPrintTable_IncludesNode(t *testing.T) {
	r := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "Check A", Subsystem: "test", Status: StatusPass, Node: "ubuntu@1.2.3.4"},
		},
	}
	var buf bytes.Buffer
	PrintTable(r, &buf)
	if !strings.Contains(buf.String(), "ubuntu@1.2.3.4") {
		t.Error("expected node name in table output")
	}
}
|
||||
|
||||
// TestPrintTable_IncludesSummary verifies the trailing summary line
// reports the pass and fail counts.
func TestPrintTable_IncludesSummary(t *testing.T) {
	r := &Results{
		Duration: 2 * time.Second,
		Checks: []CheckResult{
			{ID: "a", Subsystem: "test", Status: StatusPass},
			{ID: "b", Subsystem: "test", Status: StatusFail},
		},
	}
	var buf bytes.Buffer
	PrintTable(r, &buf)
	output := buf.String()
	if !strings.Contains(output, "1 passed") {
		t.Error("summary should mention passed count")
	}
	if !strings.Contains(output, "1 failed") {
		t.Error("summary should mention failed count")
	}
}
|
||||
|
||||
// TestPrintJSON_ValidJSON verifies PrintJSON emits parseable JSON with
// correct summary counts and a checks array of the right length.
func TestPrintJSON_ValidJSON(t *testing.T) {
	r := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "A", Subsystem: "test", Status: StatusPass, Severity: Low, Message: "ok"},
			{ID: "b", Name: "B", Subsystem: "test", Status: StatusFail, Severity: High, Message: "bad"},
		},
	}
	var buf bytes.Buffer
	PrintJSON(r, &buf)

	var parsed map[string]interface{}
	if err := json.Unmarshal(buf.Bytes(), &parsed); err != nil {
		t.Fatalf("output is not valid JSON: %v\nraw: %s", err, buf.String())
	}

	summary, ok := parsed["summary"].(map[string]interface{})
	if !ok {
		t.Fatal("missing 'summary' object in JSON")
	}
	// JSON numbers decode into float64 when targeting interface{}.
	if v := summary["passed"]; v != float64(1) {
		t.Errorf("summary.passed = %v, want 1", v)
	}
	if v := summary["failed"]; v != float64(1) {
		t.Errorf("summary.failed = %v, want 1", v)
	}
	if v := summary["total"]; v != float64(2) {
		t.Errorf("summary.total = %v, want 2", v)
	}

	checks, ok := parsed["checks"].([]interface{})
	if !ok {
		t.Fatal("missing 'checks' array in JSON")
	}
	if len(checks) != 2 {
		t.Errorf("want 2 checks, got %d", len(checks))
	}
}
|
||||
|
||||
// TestSummaryLine verifies the exact one-line summary format.
func TestSummaryLine(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{Status: StatusPass},
			{Status: StatusPass},
			{Status: StatusFail},
			{Status: StatusWarn},
		},
	}
	got := SummaryLine(r)
	want := "2 passed, 1 failed, 1 warnings, 0 skipped"
	if got != want {
		t.Errorf("SummaryLine = %q, want %q", got, want)
	}
}
|
||||
165
pkg/inspector/ssh.go
Normal file
165
pkg/inspector/ssh.go
Normal file
@ -0,0 +1,165 @@
|
||||
package inspector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Retry policy for transient SSH connection failures: up to
// sshMaxRetries additional attempts, spaced sshRetryDelay apart.
const (
	sshMaxRetries = 3
	sshRetryDelay = 2 * time.Second
)
|
||||
|
||||
// SSHResult holds the output of an SSH command execution.
type SSHResult struct {
	Stdout   string        // trimmed standard output of the remote command
	Stderr   string        // trimmed standard error (also carries ssh's own diagnostics)
	ExitCode int           // remote command exit code; 255 is ssh's connection-error code
	Duration time.Duration // wall time of the final attempt only
	Err      error         // process-level error from exec (nil if the command ran)
	Retries  int           // how many retries were needed
}
|
||||
|
||||
// OK returns true if the command succeeded (exit code 0, no error).
|
||||
func (r SSHResult) OK() bool {
|
||||
return r.Err == nil && r.ExitCode == 0
|
||||
}
|
||||
|
||||
// RunSSH executes a command on a remote node via SSH with retry on
// connection failure. Uses sshpass for password auth, falls back to -i
// for key-based auth. The -n flag is used to prevent SSH from reading
// stdin.
//
// Only connection-level failures are retried (up to sshMaxRetries,
// sshRetryDelay apart); a non-zero exit from the remote command itself
// returns immediately. Retries in the result is the number of extra
// attempts made after the first (0 when the first attempt settled it).
func RunSSH(ctx context.Context, node Node, command string) SSHResult {
	var result SSHResult
	for attempt := 0; attempt <= sshMaxRetries; attempt++ {
		result = runSSHOnce(ctx, node, command)
		result.Retries = attempt

		// Success — return immediately
		if result.OK() {
			return result
		}

		// If the command ran but returned non-zero exit, that's the remote command
		// failing (not a connection issue) — don't retry
		if result.Err == nil && result.ExitCode != 0 {
			return result
		}

		// Check if it's a connection-level failure worth retrying
		if !isSSHConnectionError(result) {
			return result
		}

		// Don't retry if context is done
		if ctx.Err() != nil {
			return result
		}

		// Wait before retry (except on last attempt)
		if attempt < sshMaxRetries {
			select {
			case <-time.After(sshRetryDelay):
			case <-ctx.Done():
				return result
			}
		}
	}
	// All attempts exhausted; return the last failure.
	return result
}
|
||||
|
||||
// runSSHOnce executes a single SSH attempt and captures stdout/stderr,
// the exit code, and the wall-clock duration. No retries at this level.
func runSSHOnce(ctx context.Context, node Node, command string) SSHResult {
	start := time.Now()

	var args []string
	if node.SSHKey != "" {
		// Key-based auth. BatchMode disables interactive password prompts
		// so a bad key fails fast instead of hanging.
		args = []string{
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			"-o", "BatchMode=yes",
			"-i", node.SSHKey,
			fmt.Sprintf("%s@%s", node.User, node.Host),
			command,
		}
	} else {
		// Password auth via sshpass (no BatchMode here — sshpass feeds the
		// password to the prompt).
		args = []string{
			"sshpass", "-p", node.Password,
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			fmt.Sprintf("%s@%s", node.User, node.Host),
			command,
		}
	}

	cmd := exec.CommandContext(ctx, args[0], args[1:]...)

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	err := cmd.Run()
	duration := time.Since(start)

	// Recover the exit code from a non-nil error. NOTE(review): the
	// syscall.WaitStatus assertion is unix-only; exitErr.ExitCode() would
	// be portable — confirm before changing, as the file imports syscall
	// solely for this.
	exitCode := 0
	if err != nil {
		if exitErr, ok := err.(*exec.ExitError); ok {
			if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
				exitCode = status.ExitStatus()
			}
		}
	}

	return SSHResult{
		Stdout:   strings.TrimSpace(stdout.String()),
		Stderr:   strings.TrimSpace(stderr.String()),
		ExitCode: exitCode,
		Duration: duration,
		Err:      err,
	}
}
|
||||
|
||||
// isSSHConnectionError returns true if the failure looks like an SSH connection
|
||||
// problem (timeout, refused, network unreachable) rather than a remote command error.
|
||||
func isSSHConnectionError(r SSHResult) bool {
|
||||
// sshpass exit code 5 = invalid/incorrect password (not retriable)
|
||||
// sshpass exit code 6 = host key verification failed (not retriable)
|
||||
// SSH exit code 255 = SSH connection error (retriable)
|
||||
if r.ExitCode == 255 {
|
||||
return true
|
||||
}
|
||||
|
||||
stderr := strings.ToLower(r.Stderr)
|
||||
connectionErrors := []string{
|
||||
"connection refused",
|
||||
"connection timed out",
|
||||
"connection reset",
|
||||
"no route to host",
|
||||
"network is unreachable",
|
||||
"could not resolve hostname",
|
||||
"ssh_exchange_identification",
|
||||
"broken pipe",
|
||||
"connection closed by remote host",
|
||||
}
|
||||
for _, pattern := range connectionErrors {
|
||||
if strings.Contains(stderr, pattern) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// RunSSHMulti executes a multi-command string on a remote node.
|
||||
// Commands are joined with " && " so failure stops execution.
|
||||
func RunSSHMulti(ctx context.Context, node Node, commands []string) SSHResult {
|
||||
combined := strings.Join(commands, " && ")
|
||||
return RunSSH(ctx, node, combined)
|
||||
}
|
||||
@ -61,7 +61,9 @@ func (cm *ClusterConfigManager) UpdateAllClusterPeers() error {
|
||||
func (cm *ClusterConfigManager) RepairPeerConfiguration() error {
|
||||
cm.logger.Info("Attempting to repair IPFS Cluster peer configuration")
|
||||
|
||||
_ = cm.FixIPFSConfigAddresses()
|
||||
if err := cm.FixIPFSConfigAddresses(); err != nil {
|
||||
cm.logger.Warn("Failed to fix IPFS config addresses during repair", zap.Error(err))
|
||||
}
|
||||
|
||||
peers, err := cm.DiscoverClusterPeersFromGateway()
|
||||
if err != nil {
|
||||
@ -72,7 +74,9 @@ func (cm *ClusterConfigManager) RepairPeerConfiguration() error {
|
||||
peerAddrs = append(peerAddrs, p.Multiaddress)
|
||||
}
|
||||
if len(peerAddrs) > 0 {
|
||||
_ = cm.UpdatePeerAddresses(peerAddrs)
|
||||
if err := cm.UpdatePeerAddresses(peerAddrs); err != nil {
|
||||
cm.logger.Warn("Failed to update peer addresses during repair", zap.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -77,19 +77,6 @@ func parseIPFSPort(rawURL string) (int, error) {
|
||||
return port, nil
|
||||
}
|
||||
|
||||
// parsePeerHostAndPort extracts the host component and TCP port from a
// multiaddr string such as "/ip4/1.2.3.4/tcp/9096/p2p/Qm...".
// Missing components are returned as "" and 0; when several host or
// port components are present, the last one wins.
func parsePeerHostAndPort(multiaddr string) (string, int) {
	parts := strings.Split(multiaddr, "/")
	var hostStr string
	var port int
	for i, part := range parts {
		// Guard i+1: a multiaddr ending in "ip4"/"dns"/"dns4"/"tcp" has no
		// value component and previously caused an index-out-of-range panic.
		if i+1 >= len(parts) {
			break
		}
		if part == "ip4" || part == "dns" || part == "dns4" {
			hostStr = parts[i+1]
		} else if part == "tcp" {
			fmt.Sscanf(parts[i+1], "%d", &port)
		}
	}
	return hostStr, port
}
|
||||
|
||||
func extractIPFromMultiaddrForCluster(maddr string) string {
|
||||
parts := strings.Split(maddr, "/")
|
||||
|
||||
@ -893,21 +893,35 @@ func (cm *ClusterManager) GetClusterStatus(ctx context.Context, clusterID string
|
||||
ClusterID: cluster.ID,
|
||||
}
|
||||
|
||||
// Check individual service status
|
||||
// TODO: Actually check each service's health
|
||||
if cluster.Status == ClusterStatusReady {
|
||||
status.RQLiteReady = true
|
||||
status.OlricReady = true
|
||||
status.GatewayReady = true
|
||||
status.DNSReady = true
|
||||
}
|
||||
|
||||
// Get node list
|
||||
// Check individual service status by inspecting cluster nodes
|
||||
nodes, err := cm.getClusterNodes(ctx, clusterID)
|
||||
if err == nil {
|
||||
runningCount := 0
|
||||
hasRQLite := false
|
||||
hasOlric := false
|
||||
hasGateway := false
|
||||
|
||||
for _, node := range nodes {
|
||||
status.Nodes = append(status.Nodes, node.NodeID)
|
||||
if node.Status == NodeStatusRunning {
|
||||
runningCount++
|
||||
}
|
||||
if node.RQLiteHTTPPort > 0 {
|
||||
hasRQLite = true
|
||||
}
|
||||
if node.OlricHTTPPort > 0 {
|
||||
hasOlric = true
|
||||
}
|
||||
if node.GatewayHTTPPort > 0 {
|
||||
hasGateway = true
|
||||
}
|
||||
}
|
||||
|
||||
allRunning := len(nodes) > 0 && runningCount == len(nodes)
|
||||
status.RQLiteReady = allRunning && hasRQLite
|
||||
status.OlricReady = allRunning && hasOlric
|
||||
status.GatewayReady = allRunning && hasGateway
|
||||
status.DNSReady = allRunning
|
||||
}
|
||||
|
||||
if cluster.ErrorMessage != "" {
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/constants"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -176,12 +177,10 @@ func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipA
|
||||
}
|
||||
|
||||
// Calculate available capacity
|
||||
const (
|
||||
maxDeployments = 100
|
||||
maxPorts = 9900 // User deployment port range
|
||||
maxMemoryMB = 8192 // 8GB
|
||||
maxCPUPercent = 400 // 4 cores
|
||||
)
|
||||
maxDeployments := constants.MaxDeploymentsPerNode
|
||||
maxPorts := constants.MaxPortsPerNode
|
||||
maxMemoryMB := constants.MaxMemoryMB
|
||||
maxCPUPercent := constants.MaxCPUPercent
|
||||
|
||||
availablePorts := maxPorts - allocatedPorts
|
||||
if availablePorts < 0 {
|
||||
@ -363,23 +362,3 @@ func (cns *ClusterNodeSelector) calculateCapacityScore(
|
||||
return totalScore
|
||||
}
|
||||
|
||||
// GetNodeByID retrieves a node's information by ID.
// Returns (nil, nil) when no row matches — callers must nil-check the
// result before dereferencing.
func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) {
	// NOTE(review): WithInternalAuth marks the query as internal —
	// presumably bypassing per-namespace auth; confirm in pkg/client.
	internalCtx := client.WithInternalAuth(ctx)

	var results []nodeInfo
	// COALESCE falls back to the public IP when internal_ip is NULL.
	query := `SELECT id, ip_address, COALESCE(internal_ip, ip_address) as internal_ip FROM dns_nodes WHERE id = ? LIMIT 1`
	err := cns.db.Query(internalCtx, &results, query, nodeID)
	if err != nil {
		return nil, &ClusterError{
			Message: "failed to query node",
			Cause:   err,
		}
	}

	if len(results) == 0 {
		return nil, nil
	}

	return &results[0], nil
}
|
||||
|
||||
@ -3,6 +3,7 @@ package namespace
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
@ -369,19 +370,5 @@ func isConflictError(err error) bool {
|
||||
return false
|
||||
}
|
||||
errStr := err.Error()
|
||||
return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
|
||||
}
|
||||
|
||||
// contains checks if a string contains a substring (case-insensitive)
|
||||
func contains(s, substr string) bool {
|
||||
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
|
||||
}
|
||||
|
||||
func findSubstring(s, substr string) bool {
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if s[i:i+len(substr)] == substr {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict")
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -269,7 +270,7 @@ func TestContains(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.s+"_"+tt.substr, func(t *testing.T) {
|
||||
result := contains(tt.s, tt.substr)
|
||||
result := strings.Contains(tt.s, tt.substr)
|
||||
if result != tt.expected {
|
||||
t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, result, tt.expected)
|
||||
}
|
||||
|
||||
@ -1,25 +1,9 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
)
|
||||
import "github.com/DeBrosOfficial/network/pkg/wireguard"
|
||||
|
||||
// getWireGuardIP returns the IPv4 address of the wg0 interface.
|
||||
// Used as a fallback when Olric BindAddr is empty or 0.0.0.0.
|
||||
func getWireGuardIP() (string, error) {
|
||||
iface, err := net.InterfaceByName("wg0")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("wg0 interface not found: %w", err)
|
||||
}
|
||||
addrs, err := iface.Addrs()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to get wg0 addresses: %w", err)
|
||||
}
|
||||
for _, addr := range addrs {
|
||||
if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil {
|
||||
return ipnet.IP.String(), nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no IPv4 address on wg0")
|
||||
return wireguard.GetIP()
|
||||
}
|
||||
|
||||
@ -11,6 +11,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/logging"
|
||||
"github.com/DeBrosOfficial/network/pkg/wireguard"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
@ -414,20 +415,7 @@ func (n *Node) isNameserverNode(ctx context.Context) bool {
|
||||
|
||||
// getWireGuardIP returns the IPv4 address assigned to the wg0 interface, if any
|
||||
func (n *Node) getWireGuardIP() (string, error) {
|
||||
iface, err := net.InterfaceByName("wg0")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
addrs, err := iface.Addrs()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, addr := range addrs {
|
||||
if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil {
|
||||
return ipnet.IP.String(), nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no IPv4 address on wg0")
|
||||
return wireguard.GetIP()
|
||||
}
|
||||
|
||||
// getNodeIPAddress attempts to determine the node's external IP address
|
||||
|
||||
@ -47,7 +47,9 @@ func (r *RQLiteManager) waitForMinClusterSizeBeforeStart(ctx context.Context, rq
|
||||
return nil
|
||||
}
|
||||
|
||||
_ = r.discoveryService.TriggerPeerExchange(ctx)
|
||||
if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil {
|
||||
r.logger.Warn("Failed to trigger peer exchange before cluster wait", zap.Error(err))
|
||||
}
|
||||
|
||||
checkInterval := 2 * time.Second
|
||||
for {
|
||||
@ -92,7 +94,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
|
||||
return fmt.Errorf("discovery service not available")
|
||||
}
|
||||
|
||||
_ = r.discoveryService.TriggerPeerExchange(ctx)
|
||||
if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil {
|
||||
r.logger.Warn("Failed to trigger peer exchange during pre-start discovery", zap.Error(err))
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
r.discoveryService.TriggerSync()
|
||||
time.Sleep(2 * time.Second)
|
||||
@ -123,7 +127,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
|
||||
zap.Int("discovered_peers", discoveredPeers),
|
||||
zap.Int("min_cluster_size", r.config.MinClusterSize))
|
||||
// Still write peers.json with just ourselves - better than nothing
|
||||
_ = r.discoveryService.ForceWritePeersJSON()
|
||||
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
|
||||
r.logger.Warn("Failed to write single-node peers.json fallback", zap.Error(err))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -137,8 +143,12 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
|
||||
}
|
||||
|
||||
if ourLogIndex == 0 && maxPeerIndex > 0 {
|
||||
_ = r.clearRaftState(rqliteDataDir)
|
||||
_ = r.discoveryService.ForceWritePeersJSON()
|
||||
if err := r.clearRaftState(rqliteDataDir); err != nil {
|
||||
r.logger.Warn("Failed to clear raft state during pre-start discovery", zap.Error(err))
|
||||
}
|
||||
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
|
||||
r.logger.Warn("Failed to write peers.json after clearing raft state", zap.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -150,7 +160,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
|
||||
|
||||
// recoverCluster restarts RQLite using peers.json
|
||||
func (r *RQLiteManager) recoverCluster(ctx context.Context, peersJSONPath string) error {
|
||||
_ = r.Stop()
|
||||
if err := r.Stop(); err != nil {
|
||||
r.logger.Warn("Failed to stop RQLite during cluster recovery", zap.Error(err))
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
rqliteDataDir, err := r.rqliteDataDirPath()
|
||||
@ -187,10 +199,14 @@ func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error {
|
||||
}
|
||||
|
||||
if ourIndex == 0 && maxPeerIndex > 0 {
|
||||
_ = r.clearRaftState(rqliteDataDir)
|
||||
if err := r.clearRaftState(rqliteDataDir); err != nil {
|
||||
r.logger.Warn("Failed to clear raft state during split-brain recovery", zap.Error(err))
|
||||
}
|
||||
r.discoveryService.TriggerPeerExchange(ctx)
|
||||
time.Sleep(1 * time.Second)
|
||||
_ = r.discoveryService.ForceWritePeersJSON()
|
||||
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
|
||||
r.logger.Warn("Failed to write peers.json during split-brain recovery", zap.Error(err))
|
||||
}
|
||||
return r.recoverCluster(ctx, filepath.Join(rqliteDataDir, "raft", "peers.json"))
|
||||
}
|
||||
|
||||
@ -265,7 +281,9 @@ func (r *RQLiteManager) startHealthMonitoring(ctx context.Context) {
|
||||
return
|
||||
case <-ticker.C:
|
||||
if r.isInSplitBrainState() {
|
||||
_ = r.recoverFromSplitBrain(ctx)
|
||||
if err := r.recoverFromSplitBrain(ctx); err != nil {
|
||||
r.logger.Warn("Split-brain recovery attempt failed", zap.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
85
pkg/serverless/cache/module_cache.go
vendored
85
pkg/serverless/cache/module_cache.go
vendored
@ -3,14 +3,21 @@ package cache
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/tetratelabs/wazero"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// cacheEntry wraps a compiled module with access tracking for LRU eviction.
|
||||
type cacheEntry struct {
|
||||
module wazero.CompiledModule
|
||||
lastAccessed time.Time
|
||||
}
|
||||
|
||||
// ModuleCache manages compiled WASM module caching.
|
||||
type ModuleCache struct {
|
||||
modules map[string]wazero.CompiledModule
|
||||
modules map[string]*cacheEntry
|
||||
mu sync.RWMutex
|
||||
capacity int
|
||||
logger *zap.Logger
|
||||
@ -19,7 +26,7 @@ type ModuleCache struct {
|
||||
// NewModuleCache creates a new ModuleCache.
|
||||
func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache {
|
||||
return &ModuleCache{
|
||||
modules: make(map[string]wazero.CompiledModule),
|
||||
modules: make(map[string]*cacheEntry),
|
||||
capacity: capacity,
|
||||
logger: logger,
|
||||
}
|
||||
@ -27,15 +34,20 @@ func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache {
|
||||
|
||||
// Get retrieves a compiled module from the cache.
|
||||
func (c *ModuleCache) Get(wasmCID string) (wazero.CompiledModule, bool) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
module, exists := c.modules[wasmCID]
|
||||
return module, exists
|
||||
entry, exists := c.modules[wasmCID]
|
||||
if !exists {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
entry.lastAccessed = time.Now()
|
||||
return entry.module, true
|
||||
}
|
||||
|
||||
// Set stores a compiled module in the cache.
|
||||
// If the cache is full, it evicts the oldest module.
|
||||
// If the cache is full, it evicts the least recently used module.
|
||||
func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
@ -50,7 +62,10 @@ func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) {
|
||||
c.evictOldest()
|
||||
}
|
||||
|
||||
c.modules[wasmCID] = module
|
||||
c.modules[wasmCID] = &cacheEntry{
|
||||
module: module,
|
||||
lastAccessed: time.Now(),
|
||||
}
|
||||
|
||||
c.logger.Debug("Module cached",
|
||||
zap.String("wasm_cid", wasmCID),
|
||||
@ -63,8 +78,8 @@ func (c *ModuleCache) Delete(ctx context.Context, wasmCID string) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
if module, exists := c.modules[wasmCID]; exists {
|
||||
_ = module.Close(ctx)
|
||||
if entry, exists := c.modules[wasmCID]; exists {
|
||||
_ = entry.module.Close(ctx)
|
||||
delete(c.modules, wasmCID)
|
||||
c.logger.Debug("Module removed from cache", zap.String("wasm_cid", wasmCID))
|
||||
}
|
||||
@ -97,8 +112,8 @@ func (c *ModuleCache) Clear(ctx context.Context) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
for cid, module := range c.modules {
|
||||
if err := module.Close(ctx); err != nil {
|
||||
for cid, entry := range c.modules {
|
||||
if err := entry.module.Close(ctx); err != nil {
|
||||
c.logger.Warn("Failed to close cached module during clear",
|
||||
zap.String("cid", cid),
|
||||
zap.Error(err),
|
||||
@ -106,7 +121,7 @@ func (c *ModuleCache) Clear(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
c.modules = make(map[string]wazero.CompiledModule)
|
||||
c.modules = make(map[string]*cacheEntry)
|
||||
c.logger.Debug("Module cache cleared")
|
||||
}
|
||||
|
||||
@ -118,16 +133,23 @@ func (c *ModuleCache) GetStats() (size int, capacity int) {
|
||||
return len(c.modules), c.capacity
|
||||
}
|
||||
|
||||
// evictOldest removes the oldest module from cache.
|
||||
// evictOldest removes the least recently accessed module from cache.
|
||||
// Must be called with mu held.
|
||||
func (c *ModuleCache) evictOldest() {
|
||||
// Simple LRU: just remove the first one we find
|
||||
// In production, you'd want proper LRU tracking
|
||||
for cid, module := range c.modules {
|
||||
_ = module.Close(context.Background())
|
||||
delete(c.modules, cid)
|
||||
c.logger.Debug("Evicted module from cache", zap.String("wasm_cid", cid))
|
||||
break
|
||||
var oldestCID string
|
||||
var oldestTime time.Time
|
||||
|
||||
for cid, entry := range c.modules {
|
||||
if oldestCID == "" || entry.lastAccessed.Before(oldestTime) {
|
||||
oldestCID = cid
|
||||
oldestTime = entry.lastAccessed
|
||||
}
|
||||
}
|
||||
|
||||
if oldestCID != "" {
|
||||
_ = c.modules[oldestCID].module.Close(context.Background())
|
||||
delete(c.modules, oldestCID)
|
||||
c.logger.Debug("Evicted LRU module from cache", zap.String("wasm_cid", oldestCID))
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,12 +157,13 @@ func (c *ModuleCache) evictOldest() {
|
||||
// The compute function is called with the lock released to avoid blocking.
|
||||
func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.CompiledModule, error)) (wazero.CompiledModule, error) {
|
||||
// Try to get from cache first
|
||||
c.mu.RLock()
|
||||
if module, exists := c.modules[wasmCID]; exists {
|
||||
c.mu.RUnlock()
|
||||
return module, nil
|
||||
c.mu.Lock()
|
||||
if entry, exists := c.modules[wasmCID]; exists {
|
||||
entry.lastAccessed = time.Now()
|
||||
c.mu.Unlock()
|
||||
return entry.module, nil
|
||||
}
|
||||
c.mu.RUnlock()
|
||||
c.mu.Unlock()
|
||||
|
||||
// Compute the module (without holding the lock)
|
||||
module, err := compute()
|
||||
@ -153,9 +176,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Double-check (another goroutine might have added it)
|
||||
if existingModule, exists := c.modules[wasmCID]; exists {
|
||||
if entry, exists := c.modules[wasmCID]; exists {
|
||||
_ = module.Close(context.Background()) // Discard our compilation
|
||||
return existingModule, nil
|
||||
entry.lastAccessed = time.Now()
|
||||
return entry.module, nil
|
||||
}
|
||||
|
||||
// Evict if cache is full
|
||||
@ -163,7 +187,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil
|
||||
c.evictOldest()
|
||||
}
|
||||
|
||||
c.modules[wasmCID] = module
|
||||
c.modules[wasmCID] = &cacheEntry{
|
||||
module: module,
|
||||
lastAccessed: time.Now(),
|
||||
}
|
||||
|
||||
c.logger.Debug("Module compiled and cached",
|
||||
zap.String("wasm_cid", wasmCID),
|
||||
|
||||
@ -81,36 +81,3 @@ func (m *ModuleLifecycle) ValidateModule(module wazero.CompiledModule) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// InstantiateModule creates a module instance for execution.
|
||||
// Note: This method is currently unused but kept for potential future use.
|
||||
func (m *ModuleLifecycle) InstantiateModule(ctx context.Context, compiled wazero.CompiledModule, config wazero.ModuleConfig) error {
|
||||
if compiled == nil {
|
||||
return fmt.Errorf("compiled module is nil")
|
||||
}
|
||||
|
||||
instance, err := m.runtime.InstantiateModule(ctx, compiled, config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to instantiate module: %w", err)
|
||||
}
|
||||
|
||||
// Close immediately - this is just for validation
|
||||
_ = instance.Close(ctx)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ModuleInfo provides information about a compiled module.
|
||||
type ModuleInfo struct {
|
||||
CID string
|
||||
SizeBytes int
|
||||
Compiled bool
|
||||
}
|
||||
|
||||
// GetModuleInfo returns information about a module.
|
||||
func (m *ModuleLifecycle) GetModuleInfo(wasmCID string, wasmBytes []byte, isCompiled bool) *ModuleInfo {
|
||||
return &ModuleInfo{
|
||||
CID: wasmCID,
|
||||
SizeBytes: len(wasmBytes),
|
||||
Compiled: isCompiled,
|
||||
}
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package serverless
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
@ -249,7 +250,7 @@ func (i *Invoker) isRetryable(err error) bool {
|
||||
|
||||
// Retry execution errors (could be transient)
|
||||
var execErr *ExecutionError
|
||||
if ok := errorAs(err, &execErr); ok {
|
||||
if errors.As(err, &execErr) {
|
||||
return true
|
||||
}
|
||||
|
||||
@ -347,22 +348,6 @@ type DLQMessage struct {
|
||||
CallerWallet string `json:"caller_wallet,omitempty"`
|
||||
}
|
||||
|
||||
// errorAs is a helper to avoid import of errors package.
|
||||
func errorAs(err error, target interface{}) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
// Simple type assertion for our custom error types
|
||||
switch t := target.(type) {
|
||||
case **ExecutionError:
|
||||
if e, ok := err.(*ExecutionError); ok {
|
||||
*t = e
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Batch Invocation (for future use)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@ -438,27 +438,6 @@ func (r *Registry) uploadWASM(ctx context.Context, wasmBytes []byte, name string
|
||||
return resp.Cid, nil
|
||||
}
|
||||
|
||||
// getLatestVersion returns the latest version number for a function.
|
||||
func (r *Registry) getLatestVersion(ctx context.Context, namespace, name string) (int, error) {
|
||||
query := `SELECT MAX(version) FROM functions WHERE namespace = ? AND name = ?`
|
||||
|
||||
var maxVersion sql.NullInt64
|
||||
var results []struct {
|
||||
MaxVersion sql.NullInt64 `db:"max(version)"`
|
||||
}
|
||||
|
||||
if err := r.db.Query(ctx, &results, query, namespace, name); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if len(results) == 0 || !results[0].MaxVersion.Valid {
|
||||
return 0, ErrFunctionNotFound
|
||||
}
|
||||
|
||||
maxVersion = results[0].MaxVersion
|
||||
return int(maxVersion.Int64), nil
|
||||
}
|
||||
|
||||
// getByNameInternal retrieves a function by name regardless of status.
|
||||
func (r *Registry) getByNameInternal(ctx context.Context, namespace, name string) (*Function, error) {
|
||||
namespace = strings.TrimSpace(namespace)
|
||||
|
||||
@ -82,12 +82,9 @@ func GetTLSConfig() *tls.Config {
|
||||
MinVersion: tls.VersionTLS12,
|
||||
}
|
||||
|
||||
// If we have a CA cert pool, use it
|
||||
// If we have a CA cert pool, use it for verifying self-signed certs
|
||||
if caCertPool != nil {
|
||||
config.RootCAs = caCertPool
|
||||
} else if len(trustedDomains) > 0 {
|
||||
// Fallback: skip verification if trusted domains are configured but no CA pool
|
||||
config.InsecureSkipVerify = true
|
||||
}
|
||||
|
||||
return config
|
||||
@ -103,11 +100,12 @@ func NewHTTPClient(timeout time.Duration) *http.Client {
|
||||
}
|
||||
}
|
||||
|
||||
// NewHTTPClientForDomain creates an HTTP client configured for a specific domain
|
||||
// NewHTTPClientForDomain creates an HTTP client configured for a specific domain.
|
||||
// Only skips TLS verification for explicitly trusted domains when no CA cert is available.
|
||||
func NewHTTPClientForDomain(timeout time.Duration, hostname string) *http.Client {
|
||||
tlsConfig := GetTLSConfig()
|
||||
|
||||
// If this domain is in trusted list and we don't have a CA pool, allow insecure
|
||||
// Only skip TLS for explicitly trusted domains when no CA pool is configured
|
||||
if caCertPool == nil && ShouldSkipTLSVerify(hostname) {
|
||||
tlsConfig.InsecureSkipVerify = true
|
||||
}
|
||||
|
||||
24
pkg/wireguard/ip.go
Normal file
24
pkg/wireguard/ip.go
Normal file
@ -0,0 +1,24 @@
|
||||
package wireguard
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
)
|
||||
|
||||
// GetIP returns the IPv4 address of the wg0 interface.
|
||||
func GetIP() (string, error) {
|
||||
iface, err := net.InterfaceByName("wg0")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("wg0 interface not found: %w", err)
|
||||
}
|
||||
addrs, err := iface.Addrs()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to get wg0 addresses: %w", err)
|
||||
}
|
||||
for _, addr := range addrs {
|
||||
if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil {
|
||||
return ipnet.IP.String(), nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no IPv4 address on wg0")
|
||||
}
|
||||
@ -1,298 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# block-node.sh - Temporarily block network access to a gateway node (local or remote)
|
||||
# Usage:
|
||||
# Local: ./scripts/block-node.sh <node_number> <duration_seconds>
|
||||
# Remote: ./scripts/block-node.sh --remote <remote_node_number> <duration_seconds>
|
||||
# Example:
|
||||
# ./scripts/block-node.sh 1 60 # Block local node-1 (port 6001) for 60 seconds
|
||||
# ./scripts/block-node.sh --remote 2 120 # Block remote node-2 for 120 seconds
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Remote node configurations - loaded from config file
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
CONFIG_FILE="$SCRIPT_DIR/remote-nodes.conf"
|
||||
|
||||
# Function to get remote node config
|
||||
get_remote_node_config() {
|
||||
local node_num="$1"
|
||||
local field="$2" # "user_host" or "password"
|
||||
|
||||
if [ ! -f "$CONFIG_FILE" ]; then
|
||||
echo ""
|
||||
return 1
|
||||
fi
|
||||
|
||||
while IFS='|' read -r num user_host password || [ -n "$num" ]; do
|
||||
# Skip comments and empty lines
|
||||
[[ "$num" =~ ^#.*$ ]] || [[ -z "$num" ]] && continue
|
||||
# Trim whitespace
|
||||
num=$(echo "$num" | xargs)
|
||||
user_host=$(echo "$user_host" | xargs)
|
||||
password=$(echo "$password" | xargs)
|
||||
|
||||
if [ "$num" = "$node_num" ]; then
|
||||
if [ "$field" = "user_host" ]; then
|
||||
echo "$user_host"
|
||||
elif [ "$field" = "password" ]; then
|
||||
echo "$password"
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
done < "$CONFIG_FILE"
|
||||
|
||||
echo ""
|
||||
return 1
|
||||
}
|
||||
|
||||
# Display usage
|
||||
usage() {
|
||||
echo -e "${RED}Error:${NC} Invalid arguments"
|
||||
echo ""
|
||||
echo -e "${BLUE}Usage:${NC}"
|
||||
echo " $0 <node_number> <duration_seconds> # Local mode"
|
||||
echo " $0 --remote <remote_node_number> <duration_seconds> # Remote mode"
|
||||
echo ""
|
||||
echo -e "${GREEN}Local Mode Examples:${NC}"
|
||||
echo " $0 1 60 # Block local node-1 (port 6001) for 60 seconds"
|
||||
echo " $0 2 120 # Block local node-2 (port 6002) for 120 seconds"
|
||||
echo ""
|
||||
echo -e "${GREEN}Remote Mode Examples:${NC}"
|
||||
echo " $0 --remote 1 60 # Block remote node-1 (51.83.128.181) for 60 seconds"
|
||||
echo " $0 --remote 3 120 # Block remote node-3 (83.171.248.66) for 120 seconds"
|
||||
echo ""
|
||||
echo -e "${YELLOW}Local Node Mapping:${NC}"
|
||||
echo " Node 1 -> Port 6001"
|
||||
echo " Node 2 -> Port 6002"
|
||||
echo " Node 3 -> Port 6003"
|
||||
echo " Node 4 -> Port 6004"
|
||||
echo " Node 5 -> Port 6005"
|
||||
echo ""
|
||||
echo -e "${YELLOW}Remote Node Mapping:${NC}"
|
||||
echo " Remote 1 -> ubuntu@51.83.128.181"
|
||||
echo " Remote 2 -> root@194.61.28.7"
|
||||
echo " Remote 3 -> root@83.171.248.66"
|
||||
echo " Remote 4 -> root@62.72.44.87"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
REMOTE_MODE=false
|
||||
if [ $# -eq 3 ] && [ "$1" == "--remote" ]; then
|
||||
REMOTE_MODE=true
|
||||
NODE_NUM="$2"
|
||||
DURATION="$3"
|
||||
elif [ $# -eq 2 ]; then
|
||||
NODE_NUM="$1"
|
||||
DURATION="$2"
|
||||
else
|
||||
usage
|
||||
fi
|
||||
|
||||
# Validate duration
|
||||
if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -le 0 ]; then
|
||||
echo -e "${RED}Error:${NC} Duration must be a positive integer"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Calculate port (local nodes use 6001-6005, remote nodes use 80 and 443)
|
||||
if [ "$REMOTE_MODE" = true ]; then
|
||||
# Remote nodes: block standard HTTP/HTTPS ports
|
||||
PORTS="80 443"
|
||||
else
|
||||
# Local nodes: block the specific gateway port
|
||||
PORT=$((6000 + NODE_NUM))
|
||||
fi
|
||||
|
||||
# Function to block ports on remote server
|
||||
block_remote_node() {
|
||||
local node_num="$1"
|
||||
local duration="$2"
|
||||
local ports="$3" # Can be space-separated list like "80 443"
|
||||
|
||||
# Validate remote node number
|
||||
if ! [[ "$node_num" =~ ^[1-4]$ ]]; then
|
||||
echo -e "${RED}Error:${NC} Remote node number must be between 1 and 4"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get credentials from config file
|
||||
local user_host=$(get_remote_node_config "$node_num" "user_host")
|
||||
local password=$(get_remote_node_config "$node_num" "password")
|
||||
|
||||
if [ -z "$user_host" ] || [ -z "$password" ]; then
|
||||
echo -e "${RED}Error:${NC} Configuration for remote node $node_num not found in $CONFIG_FILE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local host="${user_host##*@}"
|
||||
|
||||
echo -e "${BLUE}=== Remote Network Blocking Tool ===${NC}"
|
||||
echo -e "Remote Node: ${GREEN}$node_num${NC} ($user_host)"
|
||||
echo -e "Ports: ${GREEN}$ports${NC}"
|
||||
echo -e "Duration: ${GREEN}$duration seconds${NC}"
|
||||
echo ""
|
||||
|
||||
# Check if sshpass is installed
|
||||
if ! command -v sshpass &> /dev/null; then
|
||||
echo -e "${RED}Error:${NC} sshpass is not installed. Install it first:"
|
||||
echo -e " ${YELLOW}macOS:${NC} brew install hudochenkov/sshpass/sshpass"
|
||||
echo -e " ${YELLOW}Ubuntu/Debian:${NC} sudo apt-get install sshpass"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# SSH options - force password authentication only to avoid "too many auth failures"
|
||||
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -o PreferredAuthentications=password -o PubkeyAuthentication=no -o NumberOfPasswordPrompts=1"
|
||||
|
||||
echo -e "${YELLOW}Connecting to remote server...${NC}"
|
||||
|
||||
# Test connection
|
||||
if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "echo 'Connected successfully' > /dev/null"; then
|
||||
echo -e "${RED}Error:${NC} Failed to connect to $user_host"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✓${NC} Connected to $host"
|
||||
|
||||
# Install iptables rules on remote server
|
||||
echo -e "${YELLOW}Installing iptables rules on remote server...${NC}"
|
||||
|
||||
# Build iptables commands for all ports
|
||||
BLOCK_CMDS=""
|
||||
for port in $ports; do
|
||||
BLOCK_CMDS="${BLOCK_CMDS}iptables -I INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; "
|
||||
BLOCK_CMDS="${BLOCK_CMDS}iptables -I OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; "
|
||||
done
|
||||
BLOCK_CMDS="${BLOCK_CMDS}echo 'Rules installed'"
|
||||
|
||||
if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$BLOCK_CMDS"; then
|
||||
echo -e "${RED}Error:${NC} Failed to install iptables rules"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✓${NC} Ports $ports are now blocked on $host"
|
||||
echo -e "${YELLOW}Waiting $duration seconds...${NC}"
|
||||
echo ""
|
||||
|
||||
# Show countdown
|
||||
for ((i=duration; i>0; i--)); do
|
||||
printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i"
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
echo -e "${YELLOW}Removing iptables rules from remote server...${NC}"
|
||||
|
||||
# Build iptables removal commands for all ports
|
||||
UNBLOCK_CMDS=""
|
||||
for port in $ports; do
|
||||
UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; "
|
||||
UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; "
|
||||
done
|
||||
UNBLOCK_CMDS="${UNBLOCK_CMDS}echo 'Rules removed'"
|
||||
|
||||
if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$UNBLOCK_CMDS"; then
|
||||
echo -e "${YELLOW}Warning:${NC} Failed to remove some iptables rules. You may need to clean up manually."
|
||||
else
|
||||
echo -e "${GREEN}✓${NC} Ports $ports are now accessible again on $host"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}=== Done! ===${NC}"
|
||||
echo -e "Remote node ${GREEN}$node_num${NC} ($host) was unreachable for $duration seconds and is now accessible again."
|
||||
}
|
||||
|
||||
# Function to block port locally using process pause (SIGSTOP)
|
||||
block_local_node() {
|
||||
local node_num="$1"
|
||||
local duration="$2"
|
||||
local port="$3"
|
||||
|
||||
# Validate node number
|
||||
if ! [[ "$node_num" =~ ^[1-5]$ ]]; then
|
||||
echo -e "${RED}Error:${NC} Local node number must be between 1 and 5"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${BLUE}=== Local Network Blocking Tool ===${NC}"
|
||||
echo -e "Node: ${GREEN}node-$node_num${NC}"
|
||||
echo -e "Port: ${GREEN}$port${NC}"
|
||||
echo -e "Duration: ${GREEN}$duration seconds${NC}"
|
||||
echo -e "Method: ${GREEN}Process Pause (SIGSTOP/SIGCONT)${NC}"
|
||||
echo ""
|
||||
|
||||
# Find the process listening on the port
|
||||
echo -e "${YELLOW}Finding process listening on port $port...${NC}"
|
||||
|
||||
# macOS uses different tools than Linux
|
||||
if [[ "$(uname -s)" == "Darwin" ]]; then
|
||||
# macOS: use lsof
|
||||
PID=$(lsof -ti :$port 2>/dev/null | head -1 || echo "")
|
||||
else
|
||||
# Linux: use ss or netstat
|
||||
if command -v ss &> /dev/null; then
|
||||
PID=$(ss -tlnp | grep ":$port " | grep -oP 'pid=\K[0-9]+' | head -1 || echo "")
|
||||
else
|
||||
PID=$(netstat -tlnp 2>/dev/null | grep ":$port " | awk '{print $7}' | cut -d'/' -f1 | head -1 || echo "")
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -z "$PID" ]; then
|
||||
echo -e "${RED}Error:${NC} No process found listening on port $port"
|
||||
echo -e "Make sure node-$node_num is running first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get process name
|
||||
PROCESS_NAME=$(ps -p $PID -o comm= 2>/dev/null || echo "unknown")
|
||||
|
||||
echo -e "${GREEN}✓${NC} Found process: ${BLUE}$PROCESS_NAME${NC} (PID: ${BLUE}$PID${NC})"
|
||||
echo ""
|
||||
|
||||
# Pause the process
|
||||
echo -e "${YELLOW}Pausing process (SIGSTOP)...${NC}"
|
||||
if ! kill -STOP $PID 2>/dev/null; then
|
||||
echo -e "${RED}Error:${NC} Failed to pause process. You may need sudo privileges."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✓${NC} Process paused - node-$node_num is now unreachable"
|
||||
echo -e "${YELLOW}Waiting $duration seconds...${NC}"
|
||||
echo ""
|
||||
|
||||
# Show countdown
|
||||
for ((i=duration; i>0; i--)); do
|
||||
printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i"
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
# Resume the process
|
||||
echo -e "${YELLOW}Resuming process (SIGCONT)...${NC}"
|
||||
if ! kill -CONT $PID 2>/dev/null; then
|
||||
echo -e "${YELLOW}Warning:${NC} Failed to resume process. It may have been terminated."
|
||||
else
|
||||
echo -e "${GREEN}✓${NC} Process resumed - node-$node_num is now accessible again"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}=== Done! ===${NC}"
|
||||
echo -e "Local node ${GREEN}node-$node_num${NC} was unreachable for $duration seconds and is now accessible again."
|
||||
}
|
||||
|
||||
# Main execution
|
||||
if [ "$REMOTE_MODE" = true ]; then
|
||||
block_remote_node "$NODE_NUM" "$DURATION" "$PORTS"
|
||||
else
|
||||
block_local_node "$NODE_NUM" "$DURATION" "$PORT"
|
||||
fi
|
||||
@ -1,112 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Build custom CoreDNS binary with RQLite plugin
|
||||
# This script compiles CoreDNS with the custom RQLite plugin
|
||||
|
||||
COREDNS_VERSION="1.11.1"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
COREDNS_DIR="/tmp/coredns-build"
|
||||
|
||||
echo "Building CoreDNS v${COREDNS_VERSION} with RQLite plugin..."
|
||||
|
||||
# Clean previous build
|
||||
rm -rf "$COREDNS_DIR"
|
||||
mkdir -p "$COREDNS_DIR"
|
||||
|
||||
# Clone CoreDNS
|
||||
echo "Cloning CoreDNS..."
|
||||
cd "$COREDNS_DIR"
|
||||
git clone --depth 1 --branch v${COREDNS_VERSION} https://github.com/coredns/coredns.git
|
||||
cd coredns
|
||||
|
||||
# Create plugin.cfg with RQLite plugin
|
||||
echo "Configuring plugins..."
|
||||
cat > plugin.cfg <<EOF
|
||||
# Standard CoreDNS plugins
|
||||
metadata:metadata
|
||||
cancel:cancel
|
||||
tls:tls
|
||||
reload:reload
|
||||
nsid:nsid
|
||||
bufsize:bufsize
|
||||
root:root
|
||||
bind:bind
|
||||
debug:debug
|
||||
trace:trace
|
||||
ready:ready
|
||||
health:health
|
||||
pprof:pprof
|
||||
prometheus:metrics
|
||||
errors:errors
|
||||
log:log
|
||||
dnstap:dnstap
|
||||
local:local
|
||||
dns64:dns64
|
||||
acl:acl
|
||||
any:any
|
||||
chaos:chaos
|
||||
loadbalance:loadbalance
|
||||
cache:cache
|
||||
rewrite:rewrite
|
||||
header:header
|
||||
dnssec:dnssec
|
||||
autopath:autopath
|
||||
minimal:minimal
|
||||
template:template
|
||||
transfer:transfer
|
||||
hosts:hosts
|
||||
route53:route53
|
||||
azure:azure
|
||||
clouddns:clouddns
|
||||
k8s_external:k8s_external
|
||||
kubernetes:kubernetes
|
||||
file:file
|
||||
auto:auto
|
||||
secondary:secondary
|
||||
loop:loop
|
||||
forward:forward
|
||||
grpc:grpc
|
||||
erratic:erratic
|
||||
whoami:whoami
|
||||
on:github.com/coredns/caddy/onevent
|
||||
sign:sign
|
||||
view:view
|
||||
|
||||
# Response Rate Limiting (DNS amplification protection)
|
||||
rrl:rrl
|
||||
|
||||
# Custom RQLite plugin
|
||||
rqlite:github.com/DeBrosOfficial/network/pkg/coredns/rqlite
|
||||
EOF
|
||||
|
||||
# Copy RQLite plugin to CoreDNS
|
||||
echo "Copying RQLite plugin..."
|
||||
mkdir -p plugin/rqlite
|
||||
cp -r "$PROJECT_ROOT/pkg/coredns/rqlite/"* plugin/rqlite/
|
||||
|
||||
# Update go.mod to include our dependencies
|
||||
echo "Updating dependencies..."
|
||||
go get github.com/rqlite/rqlite-go@latest
|
||||
go get github.com/coredns/coredns@v${COREDNS_VERSION}
|
||||
go mod tidy
|
||||
|
||||
# Build CoreDNS
|
||||
echo "Building CoreDNS binary..."
|
||||
make
|
||||
|
||||
# Copy binary to project
|
||||
echo "Copying binary to project..."
|
||||
cp coredns "$PROJECT_ROOT/bin/coredns-custom"
|
||||
chmod +x "$PROJECT_ROOT/bin/coredns-custom"
|
||||
|
||||
echo ""
|
||||
echo "✅ CoreDNS built successfully!"
|
||||
echo "Binary location: $PROJECT_ROOT/bin/coredns-custom"
|
||||
echo ""
|
||||
echo "To deploy:"
|
||||
echo " 1. Copy binary to /usr/local/bin/coredns on each nameserver node"
|
||||
echo " 2. Copy configs/coredns/Corefile to /etc/coredns/Corefile"
|
||||
echo " 3. Start CoreDNS: sudo systemctl start coredns"
|
||||
echo ""
|
||||
@ -1,379 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Production Cluster Health Check Script
|
||||
# Tests RQLite, IPFS, and IPFS Cluster connectivity and replication
|
||||
|
||||
# Note: We don't use 'set -e' here because we want to continue testing even if individual checks fail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Node IPs - Update these if needed
|
||||
BOOTSTRAP="${BOOTSTRAP:-51.83.128.181}"
|
||||
NODE1="${NODE1:-57.128.223.92}"
|
||||
NODE2="${NODE2:-185.185.83.89}"
|
||||
|
||||
ALL_NODES=($BOOTSTRAP $NODE1 $NODE2)
|
||||
|
||||
# Counters
|
||||
PASSED=0
|
||||
FAILED=0
|
||||
WARNINGS=0
|
||||
|
||||
# Helper functions
|
||||
print_header() {
|
||||
echo ""
|
||||
echo -e "${BLUE}========================================${NC}"
|
||||
echo -e "${BLUE}$1${NC}"
|
||||
echo -e "${BLUE}========================================${NC}"
|
||||
}
|
||||
|
||||
print_test() {
|
||||
echo -e "${YELLOW}▶ $1${NC}"
|
||||
}
|
||||
|
||||
print_pass() {
|
||||
echo -e "${GREEN}✓ $1${NC}"
|
||||
PASSED=$((PASSED + 1))
|
||||
}
|
||||
|
||||
print_fail() {
|
||||
echo -e "${RED}✗ $1${NC}"
|
||||
FAILED=$((FAILED + 1))
|
||||
}
|
||||
|
||||
print_warn() {
|
||||
echo -e "${YELLOW}⚠ $1${NC}"
|
||||
WARNINGS=$((WARNINGS + 1))
|
||||
}
|
||||
|
||||
print_info() {
|
||||
echo -e " $1"
|
||||
}
|
||||
|
||||
# Test functions
|
||||
test_rqlite_status() {
|
||||
print_header "1. RQLITE CLUSTER STATUS"
|
||||
|
||||
local leader_found=false
|
||||
local follower_count=0
|
||||
local commit_indices=()
|
||||
|
||||
for i in "${!ALL_NODES[@]}"; do
|
||||
local node="${ALL_NODES[$i]}"
|
||||
print_test "Testing RQLite on $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 http://$node:5001/status 2>/dev/null); then
|
||||
print_fail "Cannot connect to RQLite on $node:5001"
|
||||
continue
|
||||
fi
|
||||
|
||||
local state=$(echo "$response" | jq -r '.store.raft.state // "unknown"')
|
||||
local num_peers=$(echo "$response" | jq -r '.store.raft.num_peers // 0')
|
||||
local commit_index=$(echo "$response" | jq -r '.store.raft.commit_index // 0')
|
||||
local last_contact=$(echo "$response" | jq -r '.store.raft.last_contact // "N/A"')
|
||||
local config=$(echo "$response" | jq -r '.store.raft.latest_configuration // "[]"')
|
||||
local node_count=$(echo "$config" | grep -o "Address" | wc -l | tr -d ' ')
|
||||
|
||||
commit_indices+=($commit_index)
|
||||
|
||||
print_info "State: $state | Peers: $num_peers | Commit Index: $commit_index | Cluster Nodes: $node_count"
|
||||
|
||||
# Check state
|
||||
if [ "$state" = "Leader" ]; then
|
||||
leader_found=true
|
||||
print_pass "Node $node is the Leader"
|
||||
elif [ "$state" = "Follower" ]; then
|
||||
follower_count=$((follower_count + 1))
|
||||
# Check last contact
|
||||
if [ "$last_contact" != "N/A" ] && [ "$last_contact" != "0" ]; then
|
||||
print_pass "Node $node is a Follower (last contact: $last_contact)"
|
||||
else
|
||||
print_warn "Node $node is Follower but last_contact is $last_contact"
|
||||
fi
|
||||
else
|
||||
print_fail "Node $node has unexpected state: $state"
|
||||
fi
|
||||
|
||||
# Check peer count
|
||||
if [ "$num_peers" = "2" ]; then
|
||||
print_pass "Node $node has correct peer count: 2"
|
||||
else
|
||||
print_fail "Node $node has incorrect peer count: $num_peers (expected 2)"
|
||||
fi
|
||||
|
||||
# Check cluster configuration
|
||||
if [ "$node_count" = "3" ]; then
|
||||
print_pass "Node $node sees all 3 cluster members"
|
||||
else
|
||||
print_fail "Node $node only sees $node_count cluster members (expected 3)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
done
|
||||
|
||||
# Check for exactly 1 leader
|
||||
if [ "$leader_found" = true ] && [ "$follower_count" = "2" ]; then
|
||||
print_pass "Cluster has 1 Leader and 2 Followers ✓"
|
||||
else
|
||||
print_fail "Invalid cluster state (Leader found: $leader_found, Followers: $follower_count)"
|
||||
fi
|
||||
|
||||
# Check commit index sync
|
||||
if [ ${#commit_indices[@]} -eq 3 ]; then
|
||||
local first="${commit_indices[0]}"
|
||||
local all_same=true
|
||||
for idx in "${commit_indices[@]}"; do
|
||||
if [ "$idx" != "$first" ]; then
|
||||
all_same=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$all_same" = true ]; then
|
||||
print_pass "All nodes have synced commit index: $first"
|
||||
else
|
||||
print_warn "Commit indices differ: ${commit_indices[*]} (might be normal if writes are happening)"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
test_rqlite_replication() {
|
||||
print_header "2. RQLITE REPLICATION TEST"
|
||||
|
||||
print_test "Creating test table and inserting data on leader ($BOOTSTRAP)"
|
||||
|
||||
# Create table
|
||||
if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '[["CREATE TABLE IF NOT EXISTS test_cluster_health (id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, node TEXT, value TEXT)"]]' 2>/dev/null); then
|
||||
print_fail "Failed to create table"
|
||||
return
|
||||
fi
|
||||
|
||||
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
|
||||
local error=$(echo "$response" | jq -r '.results[0].error')
|
||||
if [[ "$error" != "table test_cluster_health already exists" ]]; then
|
||||
print_fail "Table creation error: $error"
|
||||
return
|
||||
fi
|
||||
fi
|
||||
print_pass "Table exists"
|
||||
|
||||
# Insert test data
|
||||
local test_value="test_$(date +%s)"
|
||||
if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "[
|
||||
[\"INSERT INTO test_cluster_health (timestamp, node, value) VALUES (datetime('now'), 'bootstrap', '$test_value')\"]
|
||||
]" 2>/dev/null); then
|
||||
print_fail "Failed to insert data"
|
||||
return
|
||||
fi
|
||||
|
||||
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
|
||||
local error=$(echo "$response" | jq -r '.results[0].error')
|
||||
print_fail "Insert error: $error"
|
||||
return
|
||||
fi
|
||||
print_pass "Data inserted: $test_value"
|
||||
|
||||
# Wait for replication
|
||||
print_info "Waiting 2 seconds for replication..."
|
||||
sleep 2
|
||||
|
||||
# Query from all nodes
|
||||
for node in "${ALL_NODES[@]}"; do
|
||||
print_test "Reading from $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 -XPOST "http://$node:5001/db/query?level=weak" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "[\"SELECT * FROM test_cluster_health WHERE value = '$test_value' LIMIT 1\"]" 2>/dev/null); then
|
||||
print_fail "Failed to query from $node"
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
|
||||
local error=$(echo "$response" | jq -r '.results[0].error')
|
||||
print_fail "Query error on $node: $error"
|
||||
continue
|
||||
fi
|
||||
|
||||
local row_count=$(echo "$response" | jq -r '.results[0].values | length // 0')
|
||||
if [ "$row_count" = "1" ]; then
|
||||
local retrieved_value=$(echo "$response" | jq -r '.results[0].values[0][3] // ""')
|
||||
if [ "$retrieved_value" = "$test_value" ]; then
|
||||
print_pass "Data replicated correctly to $node"
|
||||
else
|
||||
print_fail "Data mismatch on $node (got: $retrieved_value, expected: $test_value)"
|
||||
fi
|
||||
else
|
||||
print_fail "Expected 1 row from $node, got $row_count"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
test_ipfs_status() {
|
||||
print_header "3. IPFS DAEMON STATUS"
|
||||
|
||||
for node in "${ALL_NODES[@]}"; do
|
||||
print_test "Testing IPFS on $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/id 2>/dev/null); then
|
||||
print_fail "Cannot connect to IPFS on $node:4501"
|
||||
continue
|
||||
fi
|
||||
|
||||
local peer_id=$(echo "$response" | jq -r '.ID // "unknown"')
|
||||
local addr_count=$(echo "$response" | jq -r '.Addresses | length // 0')
|
||||
local agent=$(echo "$response" | jq -r '.AgentVersion // "unknown"')
|
||||
|
||||
if [ "$peer_id" != "unknown" ]; then
|
||||
print_pass "IPFS running on $node (ID: ${peer_id:0:12}...)"
|
||||
print_info "Agent: $agent | Addresses: $addr_count"
|
||||
else
|
||||
print_fail "IPFS not responding correctly on $node"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
test_ipfs_swarm() {
|
||||
print_header "4. IPFS SWARM CONNECTIVITY"
|
||||
|
||||
for node in "${ALL_NODES[@]}"; do
|
||||
print_test "Checking IPFS swarm peers on $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/swarm/peers 2>/dev/null); then
|
||||
print_fail "Failed to get swarm peers from $node"
|
||||
continue
|
||||
fi
|
||||
|
||||
local peer_count=$(echo "$response" | jq -r '.Peers | length // 0')
|
||||
|
||||
if [ "$peer_count" = "2" ]; then
|
||||
print_pass "Node $node connected to 2 IPFS peers"
|
||||
elif [ "$peer_count" -gt "0" ]; then
|
||||
print_warn "Node $node connected to $peer_count IPFS peers (expected 2)"
|
||||
else
|
||||
print_fail "Node $node has no IPFS swarm peers"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
test_ipfs_cluster_status() {
|
||||
print_header "5. IPFS CLUSTER STATUS"
|
||||
|
||||
for node in "${ALL_NODES[@]}"; do
|
||||
print_test "Testing IPFS Cluster on $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 http://$node:9094/id 2>/dev/null); then
|
||||
print_fail "Cannot connect to IPFS Cluster on $node:9094"
|
||||
continue
|
||||
fi
|
||||
|
||||
local cluster_id=$(echo "$response" | jq -r '.id // "unknown"')
|
||||
local cluster_peers=$(echo "$response" | jq -r '.cluster_peers | length // 0')
|
||||
local version=$(echo "$response" | jq -r '.version // "unknown"')
|
||||
|
||||
if [ "$cluster_id" != "unknown" ]; then
|
||||
print_pass "IPFS Cluster running on $node (ID: ${cluster_id:0:12}...)"
|
||||
print_info "Version: $version | Cluster Peers: $cluster_peers"
|
||||
|
||||
if [ "$cluster_peers" = "3" ]; then
|
||||
print_pass "Node $node sees all 3 cluster peers"
|
||||
else
|
||||
print_warn "Node $node sees $cluster_peers cluster peers (expected 3)"
|
||||
fi
|
||||
else
|
||||
print_fail "IPFS Cluster not responding correctly on $node"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
test_ipfs_cluster_pins() {
|
||||
print_header "6. IPFS CLUSTER PIN CONSISTENCY"
|
||||
|
||||
local pin_counts=()
|
||||
|
||||
for node in "${ALL_NODES[@]}"; do
|
||||
print_test "Checking pins on $node"
|
||||
|
||||
if ! response=$(curl -s --max-time 5 http://$node:9094/pins 2>/dev/null); then
|
||||
print_fail "Failed to get pins from $node"
|
||||
pin_counts+=(0)
|
||||
continue
|
||||
fi
|
||||
|
||||
local pin_count=$(echo "$response" | jq -r 'length // 0')
|
||||
pin_counts+=($pin_count)
|
||||
print_pass "Node $node has $pin_count pins"
|
||||
done
|
||||
|
||||
# Check if all nodes have same pin count
|
||||
if [ ${#pin_counts[@]} -eq 3 ]; then
|
||||
local first="${pin_counts[0]}"
|
||||
local all_same=true
|
||||
for count in "${pin_counts[@]}"; do
|
||||
if [ "$count" != "$first" ]; then
|
||||
all_same=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$all_same" = true ]; then
|
||||
print_pass "All nodes have consistent pin count: $first"
|
||||
else
|
||||
print_warn "Pin counts differ: ${pin_counts[*]} (might be syncing)"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
print_summary() {
|
||||
print_header "TEST SUMMARY"
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}Passed: $PASSED${NC}"
|
||||
echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
|
||||
echo -e "${RED}Failed: $FAILED${NC}"
|
||||
echo ""
|
||||
|
||||
if [ $FAILED -eq 0 ]; then
|
||||
echo -e "${GREEN}🎉 All critical tests passed! Cluster is healthy.${NC}"
|
||||
exit 0
|
||||
elif [ $FAILED -le 2 ]; then
|
||||
echo -e "${YELLOW}⚠️ Some tests failed. Review the output above.${NC}"
|
||||
exit 1
|
||||
else
|
||||
echo -e "${RED}❌ Multiple failures detected. Cluster needs attention.${NC}"
|
||||
exit 2
|
||||
fi
|
||||
}
|
||||
|
||||
# Main execution
|
||||
main() {
|
||||
echo ""
|
||||
echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}"
|
||||
echo -e "${BLUE}║ DEBROS Production Cluster Health Check ║${NC}"
|
||||
echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}"
|
||||
echo ""
|
||||
echo "Testing cluster:"
|
||||
echo " Bootstrap: $BOOTSTRAP"
|
||||
echo " Node 1: $NODE1"
|
||||
echo " Node 2: $NODE2"
|
||||
|
||||
test_rqlite_status
|
||||
test_rqlite_replication
|
||||
test_ipfs_status
|
||||
test_ipfs_swarm
|
||||
test_ipfs_cluster_status
|
||||
test_ipfs_cluster_pins
|
||||
print_summary
|
||||
}
|
||||
|
||||
# Run main
|
||||
main
|
||||
|
||||
@ -1 +0,0 @@
|
||||
agreed
|
||||
Loading…
x
Reference in New Issue
Block a user