Create inspector: add cluster inspection CLI (inspect command, SSH-based health checks)

This commit is contained in:
anonpenguin23 2026-02-11 09:53:46 +02:00
parent eddf0553b7
commit 7dc6fecac2
38 changed files with 6570 additions and 2164 deletions

5
.gitignore vendored
View File

@ -102,4 +102,7 @@ bin-linux/
website/ website/
terms-agreement terms-agreement
cli
/inspector

View File

@ -88,6 +88,10 @@ func main() {
case "db": case "db":
cli.HandleDBCommand(args) cli.HandleDBCommand(args)
// Cluster inspection
case "inspect":
cli.HandleInspectCommand(args)
// Namespace management // Namespace management
case "namespace": case "namespace":
cli.HandleNamespaceCommand(args) cli.HandleNamespaceCommand(args)
@ -173,6 +177,12 @@ func showHelp() {
fmt.Printf("🏢 Namespaces:\n") fmt.Printf("🏢 Namespaces:\n")
fmt.Printf(" namespace delete - Delete current namespace and all resources\n\n") fmt.Printf(" namespace delete - Delete current namespace and all resources\n\n")
fmt.Printf("🔍 Cluster Inspection:\n")
fmt.Printf(" inspect - Inspect cluster health via SSH\n")
fmt.Printf(" inspect --env devnet - Inspect devnet nodes\n")
fmt.Printf(" inspect --subsystem rqlite - Inspect only RQLite subsystem\n")
fmt.Printf(" inspect --format json - Output as JSON\n\n")
fmt.Printf("🌍 Environments:\n") fmt.Printf("🌍 Environments:\n")
fmt.Printf(" env list - List all environments\n") fmt.Printf(" env list - List all environments\n")
fmt.Printf(" env current - Show current environment\n") fmt.Printf(" env current - Show current environment\n")

11
cmd/inspector/main.go Normal file
View File

@ -0,0 +1,11 @@
// Command inspector is a standalone entry point for the cluster
// inspection tooling. It reuses the same handler that backs the
// "inspect" subcommand of the main CLI (see the case "inspect"
// branch added in this commit), so both paths stay in sync.
package main
import (
"os"
"github.com/DeBrosOfficial/network/pkg/cli"
)
// main forwards every CLI argument (program name excluded) to the
// shared inspect handler; flag parsing (--env, --subsystem, --format)
// happens inside cli.HandleInspectCommand.
func main() {
cli.HandleInspectCommand(os.Args[1:])
}

View File

@ -1,415 +0,0 @@
//go:build e2e
package cluster_test
import (
"bytes"
"context"
"fmt"
"io"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/DeBrosOfficial/network/pkg/ipfs"
)
// Note: These tests connect directly to IPFS Cluster API (localhost:9094)
// and IPFS API (localhost:4501). They are for local development only.
// For production testing, use storage_http_test.go which uses gateway endpoints.
func TestIPFSCluster_Health(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 10 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
err = client.Health(ctx)
if err != nil {
t.Fatalf("health check failed: %v", err)
}
}
func TestIPFSCluster_GetPeerCount(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 10 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
peerCount, err := client.GetPeerCount(ctx)
if err != nil {
t.Fatalf("get peer count failed: %v", err)
}
if peerCount < 0 {
t.Fatalf("expected non-negative peer count, got %d", peerCount)
}
t.Logf("IPFS cluster peers: %d", peerCount)
}
func TestIPFSCluster_AddFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
content := []byte("IPFS cluster test content")
result, err := client.Add(ctx, bytes.NewReader(content), "test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
if result.Cid == "" {
t.Fatalf("expected non-empty CID")
}
if result.Size != int64(len(content)) {
t.Fatalf("expected size %d, got %d", len(content), result.Size)
}
t.Logf("Added file with CID: %s", result.Cid)
}
func TestIPFSCluster_PinFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file first
content := []byte("IPFS pin test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "pin-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Pin the file
pinResult, err := client.Pin(ctx, cid, "pinned-file", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
t.Logf("Pinned file: %s", cid)
}
func TestIPFSCluster_PinStatus(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add and pin file
content := []byte("IPFS status test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "status-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
pinResult, err := client.Pin(ctx, cid, "status-test", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
// Give pin time to propagate
e2e.Delay(1000)
// Get status
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Fatalf("get pin status failed: %v", err)
}
if status.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
}
if status.Name != "status-test" {
t.Fatalf("expected name 'status-test', got %s", status.Name)
}
if status.ReplicationFactor < 1 {
t.Logf("warning: replication factor is %d, expected >= 1", status.ReplicationFactor)
}
t.Logf("Pin status: %s (replication: %d, peers: %d)", status.Status, status.ReplicationFactor, len(status.Peers))
}
func TestIPFSCluster_UnpinFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add and pin file
content := []byte("IPFS unpin test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "unpin-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
_, err = client.Pin(ctx, cid, "unpin-test", 1)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
// Unpin file
err = client.Unpin(ctx, cid)
if err != nil {
t.Fatalf("unpin file failed: %v", err)
}
t.Logf("Unpinned file: %s", cid)
}
func TestIPFSCluster_GetFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file
content := []byte("IPFS get test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "get-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Give time for propagation
e2e.Delay(1000)
// Get file
rc, err := client.Get(ctx, cid, e2e.GetIPFSAPIURL())
if err != nil {
t.Fatalf("get file failed: %v", err)
}
defer rc.Close()
retrievedContent, err := io.ReadAll(rc)
if err != nil {
t.Fatalf("failed to read content: %v", err)
}
if !bytes.Equal(retrievedContent, content) {
t.Fatalf("content mismatch: expected %q, got %q", string(content), string(retrievedContent))
}
t.Logf("Retrieved file: %s (%d bytes)", cid, len(retrievedContent))
}
func TestIPFSCluster_LargeFile(t *testing.T) {
e2e.SkipIfProduction(t)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 60 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Create 5MB file
content := bytes.Repeat([]byte("x"), 5*1024*1024)
result, err := client.Add(ctx, bytes.NewReader(content), "large.bin")
if err != nil {
t.Fatalf("add large file failed: %v", err)
}
if result.Cid == "" {
t.Fatalf("expected non-empty CID")
}
if result.Size != int64(len(content)) {
t.Fatalf("expected size %d, got %d", len(content), result.Size)
}
t.Logf("Added large file with CID: %s (%d bytes)", result.Cid, result.Size)
}
func TestIPFSCluster_ReplicationFactor(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add file
content := []byte("IPFS replication test content")
addResult, err := client.Add(ctx, bytes.NewReader(content), "replication-test.txt")
if err != nil {
t.Fatalf("add file failed: %v", err)
}
cid := addResult.Cid
// Pin with specific replication factor
replicationFactor := 2
pinResult, err := client.Pin(ctx, cid, "replication-test", replicationFactor)
if err != nil {
t.Fatalf("pin file failed: %v", err)
}
if pinResult.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid)
}
// Give time for replication
e2e.Delay(2000)
// Check status
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Fatalf("get pin status failed: %v", err)
}
t.Logf("Replication factor: requested=%d, actual=%d, peers=%d", replicationFactor, status.ReplicationFactor, len(status.Peers))
}
func TestIPFSCluster_MultipleFiles(t *testing.T) {
e2e.SkipIfProduction(t) // Direct IPFS connection not available in production
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
logger := e2e.NewTestLogger(t)
cfg := ipfs.Config{
ClusterAPIURL: e2e.GetIPFSClusterURL(),
Timeout: 30 * time.Second,
}
client, err := ipfs.NewClient(cfg, logger)
if err != nil {
t.Fatalf("failed to create IPFS client: %v", err)
}
// Add multiple files
numFiles := 5
var cids []string
for i := 0; i < numFiles; i++ {
content := []byte(fmt.Sprintf("File %d", i))
result, err := client.Add(ctx, bytes.NewReader(content), fmt.Sprintf("file%d.txt", i))
if err != nil {
t.Fatalf("add file %d failed: %v", i, err)
}
cids = append(cids, result.Cid)
}
if len(cids) != numFiles {
t.Fatalf("expected %d files added, got %d", numFiles, len(cids))
}
// Verify all files exist
for i, cid := range cids {
status, err := client.PinStatus(ctx, cid)
if err != nil {
t.Logf("warning: failed to get status for file %d: %v", i, err)
continue
}
if status.Cid != cid {
t.Fatalf("expected cid %s, got %s", cid, status.Cid)
}
}
t.Logf("Successfully added and verified %d files", numFiles)
}

View File

@ -1,296 +0,0 @@
//go:build e2e
package cluster_test
import (
"context"
"net/http"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
)
func TestLibP2P_PeerConnectivity(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create and connect client
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Verify peer connectivity through the gateway
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
if len(peers) == 0 {
t.Logf("warning: no peers connected (cluster may still be initializing)")
}
}
func TestLibP2P_BootstrapPeers(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
bootstrapPeers := e2e.GetBootstrapPeers()
if len(bootstrapPeers) == 0 {
t.Skipf("E2E_BOOTSTRAP_PEERS not set; skipping")
}
// Create client with bootstrap peers explicitly set
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Give peer discovery time
e2e.Delay(2000)
// Verify we're connected (check via gateway status)
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("status request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
if resp["connected"] != true {
t.Logf("warning: client not connected to network (cluster may still be initializing)")
}
}
func TestLibP2P_MultipleClientConnections(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create multiple clients
c1 := e2e.NewNetworkClient(t)
c2 := e2e.NewNetworkClient(t)
c3 := e2e.NewNetworkClient(t)
if err := c1.Connect(); err != nil {
t.Fatalf("c1 connect failed: %v", err)
}
defer c1.Disconnect()
if err := c2.Connect(); err != nil {
t.Fatalf("c2 connect failed: %v", err)
}
defer c2.Disconnect()
if err := c3.Connect(); err != nil {
t.Fatalf("c3 connect failed: %v", err)
}
defer c3.Disconnect()
// Give peer discovery time
e2e.Delay(2000)
// Verify gateway sees multiple peers
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
if len(peers) < 1 {
t.Logf("warning: expected at least 1 peer, got %d", len(peers))
}
}
func TestLibP2P_ReconnectAfterDisconnect(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
c := e2e.NewNetworkClient(t)
// Connect
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
// Verify connected via gateway
req1 := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
_, status1, err := req1.Do(ctx)
if err != nil || status1 != http.StatusOK {
t.Logf("warning: gateway check failed before disconnect: status %d, err %v", status1, err)
}
// Disconnect
if err := c.Disconnect(); err != nil {
t.Logf("warning: disconnect failed: %v", err)
}
// Give time for disconnect to propagate
e2e.Delay(500)
// Reconnect
if err := c.Connect(); err != nil {
t.Fatalf("reconnect failed: %v", err)
}
defer c.Disconnect()
// Verify connected via gateway again
req2 := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/status",
}
_, status2, err := req2.Do(ctx)
if err != nil || status2 != http.StatusOK {
t.Logf("warning: gateway check failed after reconnect: status %d, err %v", status2, err)
}
}
func TestLibP2P_PeerDiscovery(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create client
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Give peer discovery time
e2e.Delay(3000)
// Get peer list
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
if len(peers) == 0 {
t.Logf("warning: no peers discovered (cluster may not have multiple nodes)")
} else {
// Verify peer format (should be multiaddr strings)
for _, p := range peers {
peerStr := p.(string)
if !strings.Contains(peerStr, "/p2p/") && !strings.Contains(peerStr, "/ipfs/") {
t.Logf("warning: unexpected peer format: %s", peerStr)
}
}
}
}
func TestLibP2P_PeerAddressFormat(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Create client
c := e2e.NewNetworkClient(t)
if err := c.Connect(); err != nil {
t.Fatalf("connect failed: %v", err)
}
defer c.Disconnect()
// Get peer list
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/network/peers",
}
body, status, err := req.Do(ctx)
if err != nil {
t.Fatalf("peers request failed: %v", err)
}
if status != http.StatusOK {
t.Fatalf("expected status 200, got %d", status)
}
var resp map[string]interface{}
if err := e2e.DecodeJSON(body, &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
peers := resp["peers"].([]interface{})
for _, p := range peers {
peerStr := p.(string)
// Multiaddrs should start with /
if !strings.HasPrefix(peerStr, "/") {
t.Fatalf("expected multiaddr format, got %s", peerStr)
}
}
}

View File

@ -1,338 +0,0 @@
//go:build e2e
package cluster_test
import (
"encoding/json"
"fmt"
"net"
"net/http"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/require"
)
// =============================================================================
// STRICT OLRIC CACHE DISTRIBUTION TESTS
// These tests verify that Olric cache data is properly distributed across nodes.
// Tests FAIL if distribution doesn't work - no skips, no warnings.
// =============================================================================
// getOlricNodeAddresses returns HTTP addresses of Olric nodes
// Note: Olric HTTP port is typically on port 3320 for the main cluster
func getOlricNodeAddresses() []string {
// In dev mode, we have a single Olric instance
// In production, each node runs its own Olric instance
return []string{
"http://localhost:3320",
}
}
// TestOlric_BasicDistribution verifies cache operations work across the cluster.
func TestOlric_BasicDistribution(t *testing.T) {
// Note: Not using SkipIfMissingGateway() since LoadTestEnv() creates its own API key
env, err := e2e.LoadTestEnv()
require.NoError(t, err, "FAIL: Could not load test environment")
require.NotEmpty(t, env.APIKey, "FAIL: No API key available")
dmap := fmt.Sprintf("dist_test_%d", time.Now().UnixNano())
t.Run("Put_and_get_from_same_gateway", func(t *testing.T) {
key := fmt.Sprintf("key_%d", time.Now().UnixNano())
value := fmt.Sprintf("value_%d", time.Now().UnixNano())
// Put
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
require.NoError(t, err, "FAIL: Could not put value to cache")
// Get
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get value from cache")
require.Equal(t, value, retrieved, "FAIL: Retrieved value doesn't match")
t.Logf(" ✓ Put/Get works: %s = %s", key, value)
})
t.Run("Multiple_keys_distributed", func(t *testing.T) {
// Put multiple keys (should be distributed across partitions)
keys := make(map[string]string)
for i := 0; i < 20; i++ {
key := fmt.Sprintf("dist_key_%d_%d", i, time.Now().UnixNano())
value := fmt.Sprintf("dist_value_%d", i)
keys[key] = value
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
require.NoError(t, err, "FAIL: Could not put key %s", key)
}
t.Logf(" Put 20 keys to cache")
// Verify all keys are retrievable
for key, expectedValue := range keys {
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get key %s", key)
require.Equal(t, expectedValue, retrieved, "FAIL: Value mismatch for key %s", key)
}
t.Logf(" ✓ All 20 keys are retrievable")
})
}
// TestOlric_ConcurrentAccess verifies cache handles concurrent operations correctly.
func TestOlric_ConcurrentAccess(t *testing.T) {
env, err := e2e.LoadTestEnv()
require.NoError(t, err, "FAIL: Could not load test environment")
dmap := fmt.Sprintf("concurrent_test_%d", time.Now().UnixNano())
t.Run("Concurrent_writes_to_same_key", func(t *testing.T) {
key := fmt.Sprintf("concurrent_key_%d", time.Now().UnixNano())
// Launch multiple goroutines writing to the same key
done := make(chan error, 10)
for i := 0; i < 10; i++ {
go func(idx int) {
value := fmt.Sprintf("concurrent_value_%d", idx)
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
done <- err
}(i)
}
// Wait for all writes
var errors []error
for i := 0; i < 10; i++ {
if err := <-done; err != nil {
errors = append(errors, err)
}
}
require.Empty(t, errors, "FAIL: %d concurrent writes failed: %v", len(errors), errors)
// The key should have ONE of the values (last write wins)
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get key after concurrent writes")
require.Contains(t, retrieved, "concurrent_value_", "FAIL: Value doesn't match expected pattern")
t.Logf(" ✓ Concurrent writes succeeded, final value: %s", retrieved)
})
t.Run("Concurrent_reads_and_writes", func(t *testing.T) {
key := fmt.Sprintf("rw_key_%d", time.Now().UnixNano())
initialValue := "initial_value"
// Set initial value
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, initialValue)
require.NoError(t, err, "FAIL: Could not set initial value")
// Launch concurrent readers and writers
done := make(chan error, 20)
// 10 readers
for i := 0; i < 10; i++ {
go func() {
_, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
done <- err
}()
}
// 10 writers
for i := 0; i < 10; i++ {
go func(idx int) {
value := fmt.Sprintf("updated_value_%d", idx)
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
done <- err
}(i)
}
// Wait for all operations
var readErrors, writeErrors []error
for i := 0; i < 20; i++ {
if err := <-done; err != nil {
if i < 10 {
readErrors = append(readErrors, err)
} else {
writeErrors = append(writeErrors, err)
}
}
}
require.Empty(t, readErrors, "FAIL: %d reads failed", len(readErrors))
require.Empty(t, writeErrors, "FAIL: %d writes failed", len(writeErrors))
t.Logf(" ✓ Concurrent read/write operations succeeded")
})
}
// TestOlric_NamespaceClusterCache verifies cache works in namespace-specific clusters.
func TestOlric_NamespaceClusterCache(t *testing.T) {
// Create a new namespace
namespace := fmt.Sprintf("cache-test-%d", time.Now().UnixNano())
env, err := e2e.LoadTestEnvWithNamespace(namespace)
require.NoError(t, err, "FAIL: Could not create namespace for cache test")
require.NotEmpty(t, env.APIKey, "FAIL: No API key")
t.Logf("Created namespace %s", namespace)
dmap := fmt.Sprintf("ns_cache_%d", time.Now().UnixNano())
t.Run("Cache_operations_work_in_namespace", func(t *testing.T) {
key := fmt.Sprintf("ns_key_%d", time.Now().UnixNano())
value := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
// Put using namespace API key
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
require.NoError(t, err, "FAIL: Could not put value in namespace cache")
// Get
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get value from namespace cache")
require.Equal(t, value, retrieved, "FAIL: Value mismatch in namespace cache")
t.Logf(" ✓ Namespace cache operations work: %s = %s", key, value)
})
// Check if namespace Olric instances are running (port 10003 offset in port blocks)
var nsOlricPorts []int
for port := 10003; port <= 10098; port += 5 {
conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 1*time.Second)
if err == nil {
conn.Close()
nsOlricPorts = append(nsOlricPorts, port)
}
}
if len(nsOlricPorts) > 0 {
t.Logf("Found %d namespace Olric memberlist ports: %v", len(nsOlricPorts), nsOlricPorts)
t.Run("Namespace_Olric_nodes_connected", func(t *testing.T) {
// Verify all namespace Olric nodes can be reached
for _, port := range nsOlricPorts {
conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 2*time.Second)
require.NoError(t, err, "FAIL: Cannot connect to namespace Olric on port %d", port)
conn.Close()
t.Logf(" ✓ Namespace Olric memberlist on port %d is reachable", port)
}
})
}
}
// TestOlric_DataConsistency verifies data remains consistent across operations.
func TestOlric_DataConsistency(t *testing.T) {
env, err := e2e.LoadTestEnv()
require.NoError(t, err, "FAIL: Could not load test environment")
dmap := fmt.Sprintf("consistency_test_%d", time.Now().UnixNano())
t.Run("Update_preserves_latest_value", func(t *testing.T) {
key := fmt.Sprintf("update_key_%d", time.Now().UnixNano())
// Write multiple times
for i := 1; i <= 5; i++ {
value := fmt.Sprintf("version_%d", i)
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
require.NoError(t, err, "FAIL: Could not update key to version %d", i)
}
// Final read should return latest version
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not read final value")
require.Equal(t, "version_5", retrieved, "FAIL: Latest version not preserved")
t.Logf(" ✓ Latest value preserved after 5 updates")
})
t.Run("Delete_removes_key", func(t *testing.T) {
key := fmt.Sprintf("delete_key_%d", time.Now().UnixNano())
value := "to_be_deleted"
// Put
err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value)
require.NoError(t, err, "FAIL: Could not put value")
// Verify it exists
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get value before delete")
require.Equal(t, value, retrieved)
// Delete (POST with JSON body)
deleteBody := map[string]interface{}{
"dmap": dmap,
"key": key,
}
deleteBytes, _ := json.Marshal(deleteBody)
req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/delete", strings.NewReader(string(deleteBytes)))
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+env.APIKey)
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
require.NoError(t, err, "FAIL: Delete request failed")
resp.Body.Close()
require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusNoContent,
"FAIL: Delete returned unexpected status %d", resp.StatusCode)
// Verify key is gone
_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.Error(t, err, "FAIL: Key should not exist after delete")
require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error")
t.Logf(" ✓ Delete properly removes key")
})
}
// TestOlric_TTLExpiration verifies TTL expiration works.
// NOTE: TTL is currently parsed but not applied by the cache handler (TODO in set_handler.go).
// This test is skipped until TTL support is fully implemented.
func TestOlric_TTLExpiration(t *testing.T) {
t.Skip("TTL support not yet implemented in cache handler - see set_handler.go lines 88-98")
env, err := e2e.LoadTestEnv()
require.NoError(t, err, "FAIL: Could not load test environment")
dmap := fmt.Sprintf("ttl_test_%d", time.Now().UnixNano())
t.Run("Key_expires_after_TTL", func(t *testing.T) {
key := fmt.Sprintf("ttl_key_%d", time.Now().UnixNano())
value := "expires_soon"
ttlSeconds := 3
// Put with TTL (TTL is a duration string like "3s", "1m", etc.)
reqBody := map[string]interface{}{
"dmap": dmap,
"key": key,
"value": value,
"ttl": fmt.Sprintf("%ds", ttlSeconds),
}
bodyBytes, _ := json.Marshal(reqBody)
req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/put", strings.NewReader(string(bodyBytes)))
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+env.APIKey)
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
require.NoError(t, err, "FAIL: Put with TTL failed")
resp.Body.Close()
require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated,
"FAIL: Put returned status %d", resp.StatusCode)
// Verify key exists immediately
retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.NoError(t, err, "FAIL: Could not get key immediately after put")
require.Equal(t, value, retrieved)
t.Logf(" Key exists immediately after put")
// Wait for TTL to expire (plus buffer)
time.Sleep(time.Duration(ttlSeconds+2) * time.Second)
// Key should be gone
_, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key)
require.Error(t, err, "FAIL: Key should have expired after %d seconds", ttlSeconds)
require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error after TTL")
t.Logf(" ✓ Key expired after %d seconds as expected", ttlSeconds)
})
}

View File

@ -1,479 +0,0 @@
//go:build e2e
package cluster_test
import (
"context"
"fmt"
"net/http"
"sync"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/require"
)
// =============================================================================
// STRICT RQLITE CLUSTER TESTS
// These tests verify that RQLite cluster operations work correctly.
// Tests FAIL if operations don't work - no skips, no warnings.
// =============================================================================
// TestRQLite_ClusterHealth verifies the RQLite cluster is healthy and operational.
func TestRQLite_ClusterHealth(t *testing.T) {
e2e.SkipIfMissingGateway(t)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// Check RQLite schema endpoint (proves cluster is reachable)
req := &e2e.HTTPRequest{
Method: http.MethodGet,
URL: e2e.GetGatewayURL() + "/v1/rqlite/schema",
}
body, status, err := req.Do(ctx)
require.NoError(t, err, "FAIL: Could not reach RQLite cluster")
require.Equal(t, http.StatusOK, status, "FAIL: RQLite schema endpoint returned %d: %s", status, string(body))
var schemaResp map[string]interface{}
err = e2e.DecodeJSON(body, &schemaResp)
require.NoError(t, err, "FAIL: Could not decode RQLite schema response")
// Schema endpoint should return tables array
_, hasTables := schemaResp["tables"]
require.True(t, hasTables, "FAIL: RQLite schema response missing 'tables' field")
t.Logf(" ✓ RQLite cluster is healthy and responding")
}
// TestRQLite_WriteReadConsistency verifies data written can be read back consistently.
func TestRQLite_WriteReadConsistency(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop of the test table, even if assertions fail.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create a fresh table for this test run.
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT, created_at DATETIME DEFAULT CURRENT_TIMESTAMP)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table request failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Logf("Created table %s", table)
	t.Run("Write_then_read_returns_same_data", func(t *testing.T) {
		uniqueValue := fmt.Sprintf("test_value_%d", time.Now().UnixNano())
		// Insert
		insertReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
				},
			},
		}
		_, status, err := insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Insert request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
		// Read back
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Query request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
		var queryResp map[string]interface{}
		err = e2e.DecodeJSON(body, &queryResp)
		require.NoError(t, err, "FAIL: Could not decode query response")
		// Verify we got our value back
		count, ok := queryResp["count"].(float64)
		require.True(t, ok, "FAIL: Response missing 'count' field")
		require.Equal(t, float64(1), count, "FAIL: Expected 1 row, got %v", count)
		t.Logf(" ✓ Written value '%s' was read back correctly", uniqueValue)
	})
	t.Run("Multiple_writes_all_readable", func(t *testing.T) {
		// Insert 10 rows in a single transaction.
		var statements []string
		for i := 0; i < 10; i++ {
			statements = append(statements,
				fmt.Sprintf("INSERT INTO %s (value) VALUES ('batch_%d')", table, i))
		}
		insertReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": statements,
			},
		}
		_, status, err := insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Batch insert failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Batch insert returned status %d", status)
		// Count all batch rows
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) as cnt FROM %s WHERE value LIKE 'batch_%%'", table),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status)
		var queryResp map[string]interface{}
		// Previously the decode error was ignored and the row check was
		// conditional on "rows" being present, so a malformed response
		// silently passed the test.
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' field")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row shape in count response")
		count := int(row[0].(float64))
		require.Equal(t, 10, count, "FAIL: Expected 10 batch rows, got %d", count)
		t.Logf(" ✓ All 10 batch writes are readable")
	})
}
// TestRQLite_TransactionAtomicity verifies transactions are atomic.
func TestRQLite_TransactionAtomicity(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop of the test table.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create table with a UNIQUE constraint on value.
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT UNIQUE)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Run("Successful_transaction_commits_all", func(t *testing.T) {
		txReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_1')", table),
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_2')", table),
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_3')", table),
				},
			},
		}
		_, status, err := txReq.Do(ctx)
		require.NoError(t, err, "FAIL: Transaction request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Transaction returned status %d", status)
		// Verify all 3 rows exist. Previously the query error, status and
		// decode error were all ignored and the row check was conditional,
		// so a failed verification silently passed.
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s WHERE value LIKE 'tx_val_%%'", table),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' field")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row shape in count response")
		count := int(row[0].(float64))
		require.Equal(t, 3, count, "FAIL: Transaction didn't commit all 3 rows - got %d", count)
		t.Logf(" ✓ Transaction committed all 3 rows atomically")
	})
	t.Run("Updates_preserve_consistency", func(t *testing.T) {
		// Update a value
		updateReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("UPDATE %s SET value = 'tx_val_1_updated' WHERE value = 'tx_val_1'", table),
				},
			},
		}
		_, status, err := updateReq.Do(ctx)
		require.NoError(t, err, "FAIL: Update request failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Update returned status %d", status)
		// Verify update took effect (errors were previously discarded here).
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = 'tx_val_1_updated'", table),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Verify query failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Verify query returned status %d", status)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode query response")
		count, ok := queryResp["count"].(float64)
		require.True(t, ok, "FAIL: Response missing 'count' field")
		require.Equal(t, float64(1), count, "FAIL: Update didn't take effect")
		t.Logf(" ✓ Update preserved consistency")
	})
}
// TestRQLite_ConcurrentWrites verifies the cluster handles concurrent writes correctly.
func TestRQLite_ConcurrentWrites(t *testing.T) {
	e2e.SkipIfMissingGateway(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop of the test table.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/drop-table",
			Body:   map[string]interface{}{"table": table},
		}
		dropReq.Do(context.Background())
	}()
	// Create table
	createReq := &e2e.HTTPRequest{
		Method: http.MethodPost,
		URL:    e2e.GetGatewayURL() + "/v1/rqlite/create-table",
		Body: map[string]interface{}{
			"schema": fmt.Sprintf(
				"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, worker INTEGER, seq INTEGER)",
				table,
			),
		},
	}
	_, status, err := createReq.Do(ctx)
	require.NoError(t, err, "FAIL: Create table failed")
	require.True(t, status == http.StatusCreated || status == http.StatusOK,
		"FAIL: Create table returned status %d", status)
	t.Run("Concurrent_inserts_all_succeed", func(t *testing.T) {
		numWorkers := 5
		insertsPerWorker := 10
		expectedTotal := numWorkers * insertsPerWorker
		var wg sync.WaitGroup
		// Each worker aborts on its first error, so at most numWorkers errors
		// are ever sent; the buffer is sized generously regardless.
		errChan := make(chan error, numWorkers*insertsPerWorker)
		for w := 0; w < numWorkers; w++ {
			wg.Add(1)
			go func(workerID int) {
				defer wg.Done()
				for i := 0; i < insertsPerWorker; i++ {
					insertReq := &e2e.HTTPRequest{
						Method: http.MethodPost,
						URL:    e2e.GetGatewayURL() + "/v1/rqlite/transaction",
						Body: map[string]interface{}{
							"statements": []string{
								fmt.Sprintf("INSERT INTO %s (worker, seq) VALUES (%d, %d)", table, workerID, i),
							},
						},
					}
					_, status, err := insertReq.Do(ctx)
					if err != nil {
						errChan <- fmt.Errorf("worker %d insert %d failed: %w", workerID, i, err)
						return
					}
					if status != http.StatusOK {
						errChan <- fmt.Errorf("worker %d insert %d got status %d", workerID, i, status)
						return
					}
				}
			}(w)
		}
		wg.Wait()
		close(errChan)
		// Collect errors
		var errors []error
		for err := range errChan {
			errors = append(errors, err)
		}
		require.Empty(t, errors, "FAIL: %d concurrent inserts failed: %v", len(errors), errors)
		// Verify total count. Previously the query error, status and decode
		// error were ignored and the row check was conditional, so a failed
		// verification silently passed.
		queryReq := &e2e.HTTPRequest{
			Method: http.MethodPost,
			URL:    e2e.GetGatewayURL() + "/v1/rqlite/query",
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT COUNT(*) FROM %s", table),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Count query failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status)
		var queryResp map[string]interface{}
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode count response")
		rows, ok := queryResp["rows"].([]interface{})
		require.True(t, ok, "FAIL: Count response missing 'rows' field")
		require.NotEmpty(t, rows, "FAIL: Count query returned no rows")
		row, ok := rows[0].([]interface{})
		require.True(t, ok, "FAIL: Unexpected row shape in count response")
		count := int(row[0].(float64))
		require.Equal(t, expectedTotal, count,
			"FAIL: Expected %d total rows from concurrent inserts, got %d", expectedTotal, count)
		t.Logf(" ✓ All %d concurrent inserts succeeded", expectedTotal)
	})
}
// TestRQLite_NamespaceClusterOperations verifies RQLite works in namespace clusters.
func TestRQLite_NamespaceClusterOperations(t *testing.T) {
	// Create a new namespace; provisioning also issues the API key used below.
	namespace := fmt.Sprintf("rqlite-test-%d", time.Now().UnixNano())
	env, err := e2e.LoadTestEnvWithNamespace(namespace)
	require.NoError(t, err, "FAIL: Could not create namespace for RQLite test")
	require.NotEmpty(t, env.APIKey, "FAIL: No API key - namespace provisioning failed")
	t.Logf("Created namespace %s", namespace)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	table := e2e.GenerateTableName()
	// Cleanup: best-effort drop of the test table in the namespace cluster.
	defer func() {
		dropReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/drop-table",
			Body:    map[string]interface{}{"table": table},
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
		}
		dropReq.Do(context.Background())
	}()
	t.Run("Namespace_RQLite_create_insert_query", func(t *testing.T) {
		// Create table in namespace cluster
		createReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/create-table",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"schema": fmt.Sprintf(
					"CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT)",
					table,
				),
			},
		}
		_, status, err := createReq.Do(ctx)
		require.NoError(t, err, "FAIL: Create table in namespace failed")
		require.True(t, status == http.StatusCreated || status == http.StatusOK,
			"FAIL: Create table returned status %d", status)
		// Insert data
		uniqueValue := fmt.Sprintf("ns_value_%d", time.Now().UnixNano())
		insertReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/transaction",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"statements": []string{
					fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue),
				},
			},
		}
		_, status, err = insertReq.Do(ctx)
		require.NoError(t, err, "FAIL: Insert in namespace failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status)
		// Query data
		queryReq := &e2e.HTTPRequest{
			Method:  http.MethodPost,
			URL:     env.GatewayURL + "/v1/rqlite/query",
			Headers: map[string]string{"Authorization": "Bearer " + env.APIKey},
			Body: map[string]interface{}{
				"sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue),
			},
		}
		body, status, err := queryReq.Do(ctx)
		require.NoError(t, err, "FAIL: Query in namespace failed")
		require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status)
		var queryResp map[string]interface{}
		// Previously the decode error and the 'count' type assertion were
		// silently ignored, masking malformed responses.
		require.NoError(t, e2e.DecodeJSON(body, &queryResp), "FAIL: Could not decode query response")
		count, ok := queryResp["count"].(float64)
		require.True(t, ok, "FAIL: Response missing 'count' field")
		require.Equal(t, float64(1), count, "FAIL: Data not found in namespace cluster")
		t.Logf(" ✓ Namespace RQLite operations work correctly")
	})
}

View File

@ -1,333 +0,0 @@
//go:build e2e && production
package production
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDNS_MultipleARecords verifies that deploying with replicas creates
// multiple A records (one per node) for DNS round-robin.
func TestDNS_MultipleARecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	if len(env.Config.Servers) < 2 {
		t.Skip("Requires at least 2 servers")
	}
	deploymentName := fmt.Sprintf("dns-multi-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, deploymentID)
		}
	}()
	// Wait for replica setup and DNS propagation
	time.Sleep(15 * time.Second)
	t.Run("DNS returns multiple IPs", func(t *testing.T) {
		deployment := e2e.GetDeployment(t, env, deploymentID)
		subdomain, _ := deployment["subdomain"].(string)
		if subdomain == "" {
			subdomain = deploymentName
		}
		fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
		// Query our first server's nameserver directly: the custom Dial
		// always targets nameserverIP:53, ignoring the resolver-supplied
		// address.
		nameserverIP := env.Config.Servers[0].IP
		resolver := &net.Resolver{
			PreferGo: true,
			Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
				d := net.Dialer{Timeout: 10 * time.Second}
				// Use DialContext so the lookup's context deadline and
				// cancellation propagate (plain d.Dial dropped ctx).
				return d.DialContext(ctx, "udp", nameserverIP+":53")
			},
		}
		ctx := context.Background()
		ips, err := resolver.LookupHost(ctx, fqdn)
		if err != nil {
			t.Logf("DNS lookup failed for %s: %v", fqdn, err)
			t.Log("Trying net.LookupHost instead...")
			ips, err = net.LookupHost(fqdn)
		}
		if err != nil {
			t.Logf("DNS lookup failed: %v (DNS may not be propagated yet)", err)
			t.Skip("DNS not yet propagated")
		}
		t.Logf("DNS returned %d IPs for %s: %v", len(ips), fqdn, ips)
		assert.GreaterOrEqual(t, len(ips), 2,
			"Should have at least 2 A records (home + replica)")
		// Verify returned IPs are from our server list
		serverIPs := e2e.GetServerIPs(env.Config)
		for _, ip := range ips {
			assert.Contains(t, serverIPs, ip,
				"DNS IP %s should be one of our servers", ip)
		}
	})
}
// TestDNS_CleanupOnDelete verifies that deleting a deployment removes all
// DNS records (both home and replica A records).
func TestDNS_CleanupOnDelete(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	// Create a throwaway deployment whose DNS records we will delete.
	deploymentName := fmt.Sprintf("dns-cleanup-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	// Wait for DNS
	time.Sleep(10 * time.Second)
	// Get subdomain before deletion (the deployment record is gone afterwards).
	deployment := e2e.GetDeployment(t, env, deploymentID)
	subdomain, _ := deployment["subdomain"].(string)
	if subdomain == "" {
		// Fall back to the name-based subdomain when the API omits it.
		subdomain = deploymentName
	}
	fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain)
	// Verify DNS works before deletion
	t.Run("DNS resolves before deletion", func(t *testing.T) {
		nodeURL := extractNodeURLProd(t, deployment)
		if nodeURL == "" {
			t.Skip("No URL to test")
		}
		domain := extractDomainProd(nodeURL)
		// Best-effort sanity check: hit the gateway with the deployment's
		// Host header; the status is only logged, not asserted.
		req, _ := http.NewRequest("GET", env.GatewayURL+"/", nil)
		req.Host = domain
		resp, err := env.HTTPClient.Do(req)
		if err == nil {
			resp.Body.Close()
			t.Logf("Pre-delete: status=%d", resp.StatusCode)
		}
	})
	// Delete
	e2e.DeleteDeployment(t, env, deploymentID)
	time.Sleep(10 * time.Second)
	t.Run("DNS records removed after deletion", func(t *testing.T) {
		ips, err := net.LookupHost(fqdn)
		if err != nil {
			t.Logf("DNS lookup failed (expected): %v", err)
			return // Good — no records
		}
		// If we still get IPs, they might be cached. Log and warn.
		// (Deliberately not a failure: resolver caches make this flaky.)
		if len(ips) > 0 {
			t.Logf("WARNING: DNS still returns %d IPs after deletion (may be cached): %v", len(ips), ips)
		}
	})
}
// TestDNS_CustomSubdomain verifies that deploying with a custom subdomain
// creates DNS records using the custom name.
func TestDNS_CustomSubdomain(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	deploymentName := fmt.Sprintf("dns-custom-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := createDeploymentWithSubdomain(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, deploymentID)
		}
	}()
	// Give the deployment record / DNS time to settle.
	time.Sleep(10 * time.Second)
	t.Run("Deployment has subdomain with random suffix", func(t *testing.T) {
		deployment := e2e.GetDeployment(t, env, deploymentID)
		subdomain, _ := deployment["subdomain"].(string)
		require.NotEmpty(t, subdomain, "Deployment should have a subdomain")
		t.Logf("Subdomain: %s", subdomain)
		// Verify the subdomain starts with the deployment name.
		// NOTE(review): deploymentName[:10] assumes the name is at least 10
		// characters long; "dns-custom-<unix>" always is, but this would
		// panic for shorter names — confirm if the prefix is reused.
		assert.Contains(t, subdomain, deploymentName[:10],
			"Subdomain should relate to deployment name")
	})
}
// TestDNS_RedeployPreservesSubdomain verifies that updating a deployment
// does not change the subdomain/DNS.
func TestDNS_RedeployPreservesSubdomain(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err)
	deploymentName := fmt.Sprintf("dns-preserve-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	require.NotEmpty(t, deploymentID)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, deploymentID)
		}
	}()
	time.Sleep(5 * time.Second)
	// Get original subdomain (and URLs, logged for debugging only).
	deployment := e2e.GetDeployment(t, env, deploymentID)
	originalSubdomain, _ := deployment["subdomain"].(string)
	originalURLs := deployment["urls"]
	t.Logf("Original subdomain: %s, urls: %v", originalSubdomain, originalURLs)
	// Update: re-upload the same tarball under the same deployment name.
	updateStaticDeploymentProd(t, env, deploymentName, tarballPath)
	time.Sleep(5 * time.Second)
	// Verify subdomain unchanged
	t.Run("Subdomain unchanged after update", func(t *testing.T) {
		updated := e2e.GetDeployment(t, env, deploymentID)
		updatedSubdomain, _ := updated["subdomain"].(string)
		assert.Equal(t, originalSubdomain, updatedSubdomain,
			"Subdomain should not change after update")
		t.Logf("After update: subdomain=%s", updatedSubdomain)
	})
}
// createDeploymentWithSubdomain uploads a static app tarball to the gateway
// and returns the new deployment ID. tarballPath may be a directory (tarred
// on the fly via the system tar) or a prebuilt .tar.gz file. Fails the test
// on any I/O or HTTP error.
func createDeploymentWithSubdomain(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) string {
	t.Helper()
	var fileData []byte
	info, err := os.Stat(tarballPath)
	require.NoError(t, err)
	if info.IsDir() {
		// Pack the directory into a gzipped tarball in memory.
		fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
		require.NoError(t, err)
	} else {
		file, err := os.Open(tarballPath)
		require.NoError(t, err)
		defer file.Close()
		fileData, err = io.ReadAll(file)
		require.NoError(t, err) // previously ignored: a short read produced a corrupt upload
	}
	// Build the multipart form body by hand with a fixed boundary.
	body := &bytes.Buffer{}
	boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
	body.WriteString(name + "\r\n")
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
	body.WriteString("Content-Type: application/gzip\r\n\r\n")
	body.Write(fileData)
	body.WriteString("\r\n--" + boundary + "--\r\n")
	req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/upload", body)
	require.NoError(t, err)
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	req.Header.Set("Authorization", "Bearer "+env.APIKey)
	resp, err := env.HTTPClient.Do(req)
	require.NoError(t, err)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusCreated {
		bodyBytes, _ := io.ReadAll(resp.Body)
		t.Fatalf("Upload failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
	}
	var result map[string]interface{}
	// Previously the decode error was ignored; a malformed body now fails loudly.
	require.NoError(t, json.NewDecoder(resp.Body).Decode(&result))
	// The API has returned the ID under either key historically; accept both.
	if id, ok := result["deployment_id"].(string); ok {
		return id
	}
	if id, ok := result["id"].(string); ok {
		return id
	}
	t.Fatalf("No id in response: %+v", result)
	return ""
}
// updateStaticDeploymentProd re-uploads a static app tarball to the update
// endpoint for an existing deployment identified by name. tarballPath may be
// a directory (tarred on the fly) or a prebuilt .tar.gz file. Fails the test
// on any I/O or HTTP error.
func updateStaticDeploymentProd(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) {
	t.Helper()
	var fileData []byte
	info, err := os.Stat(tarballPath)
	require.NoError(t, err)
	if info.IsDir() {
		// Pack the directory into a gzipped tarball in memory.
		fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output()
		require.NoError(t, err)
	} else {
		file, err := os.Open(tarballPath)
		require.NoError(t, err)
		defer file.Close()
		fileData, err = io.ReadAll(file)
		require.NoError(t, err) // previously ignored: a short read produced a corrupt upload
	}
	// Build the multipart form body by hand with a fixed boundary.
	body := &bytes.Buffer{}
	boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW"
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n")
	body.WriteString(name + "\r\n")
	body.WriteString("--" + boundary + "\r\n")
	body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n")
	body.WriteString("Content-Type: application/gzip\r\n\r\n")
	body.Write(fileData)
	body.WriteString("\r\n--" + boundary + "--\r\n")
	req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/update", body)
	require.NoError(t, err)
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	req.Header.Set("Authorization", "Bearer "+env.APIKey)
	resp, err := env.HTTPClient.Do(req)
	require.NoError(t, err)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		bodyBytes, _ := io.ReadAll(resp.Body)
		t.Fatalf("Update failed: status=%d body=%s", resp.StatusCode, string(bodyBytes))
	}
}

View File

@ -1,121 +0,0 @@
//go:build e2e && production
package production
import (
"context"
"fmt"
"net"
"path/filepath"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDNS_DeploymentResolution tests that deployed applications are resolvable via DNS
// This test requires production mode as it performs real DNS lookups
func TestDNS_DeploymentResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	deploymentName := fmt.Sprintf("dns-test-%d", time.Now().Unix())
	tarballPath := filepath.Join("../../testdata/apps/react-app")
	deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath)
	defer func() {
		if !env.SkipCleanup {
			e2e.DeleteDeployment(t, env, deploymentID)
		}
	}()
	// Wait for DNS propagation
	domain := env.BuildDeploymentDomain(deploymentName)
	t.Logf("Testing DNS resolution for: %s", domain)
	t.Run("DNS resolves to valid server IP", func(t *testing.T) {
		// Allow some time for DNS propagation
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		// Poll every 2s until the name resolves or the deadline expires.
		// (Replaces the previous goto-based loop with structured control flow.)
		var ips []string
		var err error
		for {
			ips, err = net.LookupHost(domain)
			if err == nil && len(ips) > 0 {
				break
			}
			select {
			case <-ctx.Done():
				t.Fatalf("DNS resolution timeout for %s", domain)
			case <-time.After(2 * time.Second):
			}
		}
		t.Logf("DNS resolved: %s -> %v", domain, ips)
		assert.NotEmpty(t, ips, "Should have IP addresses")
		// Verify at least one resolved IP belongs to our server fleet.
		validIPs := e2e.GetServerIPs(env.Config)
		if len(validIPs) > 0 {
			found := false
		search:
			for _, ip := range ips {
				for _, validIP := range validIPs {
					if ip == validIP {
						found = true
						// Previously only the inner loop broke; stop the
						// whole search on the first match.
						break search
					}
				}
			}
			assert.True(t, found, "Resolved IP should be one of our servers: %v (valid: %v)", ips, validIPs)
		}
	})
}
// TestDNS_BaseDomainResolution tests that the base domain resolves correctly
func TestDNS_BaseDomainResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	testEnv, loadErr := e2e.LoadTestEnv()
	require.NoError(t, loadErr, "Failed to load test environment")
	t.Run("Base domain resolves", func(t *testing.T) {
		// A bare A-record lookup of the configured base domain.
		domain := testEnv.BaseDomain
		addrs, lookupErr := net.LookupHost(domain)
		require.NoError(t, lookupErr, "Base domain %s should resolve", domain)
		assert.NotEmpty(t, addrs, "Should have IP addresses")
		t.Logf("✓ Base domain %s resolves to: %v", domain, addrs)
	})
}
// TestDNS_WildcardResolution tests wildcard DNS for arbitrary subdomains
func TestDNS_WildcardResolution(t *testing.T) {
	e2e.SkipIfLocal(t)
	testEnv, loadErr := e2e.LoadTestEnv()
	require.NoError(t, loadErr, "Failed to load test environment")
	t.Run("Wildcard subdomain resolves", func(t *testing.T) {
		// Probe a subdomain no deployment owns — only a wildcard record
		// can answer it.
		probe := fmt.Sprintf("random-test-%d.%s", time.Now().UnixNano(), testEnv.BaseDomain)
		addrs, lookupErr := net.LookupHost(probe)
		if lookupErr != nil {
			// DNS may not support wildcard - that's OK for some setups
			t.Logf("⚠ Wildcard DNS not configured (this may be expected): %v", lookupErr)
			t.Skip("Wildcard DNS not configured")
			return
		}
		assert.NotEmpty(t, addrs, "Wildcard subdomain should resolve")
		t.Logf("✓ Wildcard subdomain resolves: %s -> %v", probe, addrs)
	})
}

View File

@ -1,181 +0,0 @@
//go:build e2e && production
package production
import (
"context"
"net"
"strings"
"testing"
"time"
"github.com/DeBrosOfficial/network/e2e"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestNameserver_NSRecords tests that NS records are properly configured for the domain
func TestNameserver_NSRecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	if len(env.Config.Nameservers) == 0 {
		t.Skip("No nameservers configured in e2e/config.yaml")
	}
	t.Run("NS records exist for base domain", func(t *testing.T) {
		nsRecords, err := net.LookupNS(env.BaseDomain)
		require.NoError(t, err, "Should be able to look up NS records for %s", env.BaseDomain)
		require.NotEmpty(t, nsRecords, "Should have NS records")
		t.Logf("Found %d NS records for %s:", len(nsRecords), env.BaseDomain)
		for _, ns := range nsRecords {
			t.Logf("  - %s", ns.Host)
		}
		// Verify our nameservers are listed. NS hosts come back fully
		// qualified (trailing dot) and DNS names are case-insensitive
		// (RFC 4343), so trim the dot and compare with EqualFold.
		// The old `nsHost == expected+"."` clause was dead code: nsHost
		// never ends in "." after TrimSuffix.
		for _, expected := range env.Config.Nameservers {
			found := false
			for _, ns := range nsRecords {
				nsHost := strings.TrimSuffix(ns.Host, ".")
				if strings.EqualFold(nsHost, expected) {
					found = true
					break
				}
			}
			assert.True(t, found, "NS records should include %s", expected)
		}
	})
}
// TestNameserver_GlueRecords tests that glue records point to correct IPs
func TestNameserver_GlueRecords(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	if len(env.Config.Nameservers) == 0 {
		t.Skip("No nameservers configured in e2e/config.yaml")
	}
	nameserverServers := e2e.GetNameserverServers(env.Config)
	if len(nameserverServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("Glue records resolve to correct IPs", func(t *testing.T) {
		for i, ns := range env.Config.Nameservers {
			// Every configured nameserver hostname must resolve.
			ips, err := net.LookupHost(ns)
			require.NoError(t, err, "Nameserver %s should resolve", ns)
			require.NotEmpty(t, ips, "Nameserver %s should have IP addresses", ns)
			t.Logf("Nameserver %s resolves to: %v", ns, ips)
			// If we have the expected IP, verify it matches
			// NOTE(review): this assumes Config.Nameservers[i] corresponds
			// to nameserverServers[i] (same ordering in config) — confirm;
			// a reordered config would produce false failures here.
			if i < len(nameserverServers) {
				expectedIP := nameserverServers[i].IP
				found := false
				for _, ip := range ips {
					if ip == expectedIP {
						found = true
						break
					}
				}
				assert.True(t, found, "Glue record for %s should point to %s (got %v)", ns, expectedIP, ips)
			}
		}
	})
}
// TestNameserver_CoreDNSResponds tests that our CoreDNS servers respond to queries
func TestNameserver_CoreDNSResponds(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	nameserverServers := e2e.GetNameserverServers(env.Config)
	if len(nameserverServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("CoreDNS servers respond to queries", func(t *testing.T) {
		for _, server := range nameserverServers {
			// One subtest per nameserver so failures are attributable.
			t.Run(server.Name, func(t *testing.T) {
				// Create a custom resolver that queries this specific server:
				// every dial is redirected to server.IP:53 over UDP,
				// ignoring the resolver-supplied address.
				resolver := &net.Resolver{
					PreferGo: true,
					Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
						d := net.Dialer{
							Timeout: 5 * time.Second,
						}
						return d.DialContext(ctx, "udp", server.IP+":53")
					},
				}
				ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
				defer cancel()
				// Query the base domain
				ips, err := resolver.LookupHost(ctx, env.BaseDomain)
				if err != nil {
					// Log the error but don't fail - server might be configured differently
					t.Logf("⚠ CoreDNS at %s (%s) query error: %v", server.Name, server.IP, err)
					return
				}
				t.Logf("✓ CoreDNS at %s (%s) responded: %s -> %v", server.Name, server.IP, env.BaseDomain, ips)
				assert.NotEmpty(t, ips, "CoreDNS should return IP addresses")
			})
		}
	})
}
// TestNameserver_QueryLatency tests DNS query latency from our nameservers
func TestNameserver_QueryLatency(t *testing.T) {
	e2e.SkipIfLocal(t)
	env, err := e2e.LoadTestEnv()
	require.NoError(t, err, "Failed to load test environment")
	nameserverServers := e2e.GetNameserverServers(env.Config)
	if len(nameserverServers) == 0 {
		t.Skip("No servers marked as nameservers in config")
	}
	t.Run("DNS query latency is acceptable", func(t *testing.T) {
		for _, server := range nameserverServers {
			// Custom resolver pinned to this server's port 53 over UDP.
			resolver := &net.Resolver{
				PreferGo: true,
				Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
					d := net.Dialer{
						Timeout: 5 * time.Second,
					}
					return d.DialContext(ctx, "udp", server.IP+":53")
				},
			}
			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			start := time.Now()
			_, err := resolver.LookupHost(ctx, env.BaseDomain)
			latency := time.Since(start)
			// Release the context's timer immediately. The previous
			// `defer cancel()` inside this loop accumulated deferred calls
			// until the subtest returned (defer-in-loop bug).
			cancel()
			if err != nil {
				t.Logf("⚠ Query to %s failed: %v", server.Name, err)
				continue
			}
			t.Logf("DNS latency from %s (%s): %v", server.Name, server.IP, latency)
			// DNS queries should be fast (under 500ms is reasonable)
			assert.Less(t, latency, 500*time.Millisecond,
				"DNS query to %s should complete in under 500ms", server.Name)
		}
	})
}

BIN
inspector Executable file

Binary file not shown.

158
pkg/cli/inspect_command.go Normal file
View File

@ -0,0 +1,158 @@
package cli
import (
"bufio"
"context"
"flag"
"fmt"
"os"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
// Import checks package so init() registers the checkers
_ "github.com/DeBrosOfficial/network/pkg/inspector/checks"
)
// loadDotEnv loads key=value pairs from a .env file into os environment.
// Only sets vars that are not already set (env takes precedence over file).
func loadDotEnv(path string) {
f, err := os.Open(path)
if err != nil {
return // .env is optional
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
eq := strings.IndexByte(line, '=')
if eq < 1 {
continue
}
key := line[:eq]
value := line[eq+1:]
// Only set if not already in environment
if os.Getenv(key) == "" {
os.Setenv(key, value)
}
}
}
// HandleInspectCommand handles the "orama inspect" command.
//
// Flow: load optional .env → parse flags → load and filter the node list →
// collect data over SSH → run registered checks → print a report →
// optionally run AI analysis on failures/warnings → exit non-zero if any
// check failed.
func HandleInspectCommand(args []string) {
	// Load .env file from current directory (only sets unset vars)
	loadDotEnv(".env")
	fs := flag.NewFlagSet("inspect", flag.ExitOnError)
	configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf")
	env := fs.String("env", "", "Environment to inspect (devnet, testnet)")
	subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,all)")
	format := fs.String("format", "table", "Output format (table, json)")
	timeout := fs.Duration("timeout", 30*time.Second, "SSH command timeout")
	verbose := fs.Bool("verbose", false, "Verbose output")
	// AI flags
	aiEnabled := fs.Bool("ai", false, "Enable AI analysis of failures")
	aiModel := fs.String("model", "moonshotai/kimi-k2.5", "OpenRouter model for AI analysis")
	aiAPIKey := fs.String("api-key", "", "OpenRouter API key (or OPENROUTER_API_KEY env)")
	fs.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: orama inspect [flags]\n\n")
		fmt.Fprintf(os.Stderr, "Inspect cluster health by SSHing into nodes and running checks.\n\n")
		fmt.Fprintf(os.Stderr, "Flags:\n")
		fs.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --subsystem rqlite\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --ai\n")
		fmt.Fprintf(os.Stderr, "  orama inspect --env devnet --ai --model openai/gpt-4o\n")
	}
	// flag.ExitOnError means Parse exits on error itself; this branch is a
	// defensive fallback.
	if err := fs.Parse(args); err != nil {
		os.Exit(1)
	}
	if *env == "" {
		fmt.Fprintf(os.Stderr, "Error: --env is required (devnet, testnet)\n")
		os.Exit(1)
	}
	// Load nodes
	nodes, err := inspector.LoadNodes(*configPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error loading config: %v\n", err)
		os.Exit(1)
	}
	// Filter by environment
	nodes = inspector.FilterByEnv(nodes, *env)
	if len(nodes) == 0 {
		fmt.Fprintf(os.Stderr, "Error: no nodes found for environment %q\n", *env)
		os.Exit(1)
	}
	// Parse subsystems: a nil slice means "all" to the collector/checkers.
	var subsystems []string
	if *subsystem != "all" {
		subsystems = strings.Split(*subsystem, ",")
	}
	fmt.Printf("Inspecting %d %s nodes", len(nodes), *env)
	if len(subsystems) > 0 {
		fmt.Printf(" [%s]", strings.Join(subsystems, ","))
	}
	if *aiEnabled {
		fmt.Printf(" (AI: %s)", *aiModel)
	}
	fmt.Printf("...\n\n")
	// Phase 1: Collect. Overall deadline = per-command SSH timeout + slack.
	ctx, cancel := context.WithTimeout(context.Background(), *timeout+10*time.Second)
	defer cancel()
	if *verbose {
		// Note: timeout is a *time.Duration; %s still prints the duration
		// because Duration's String method is promoted to the pointer.
		fmt.Printf("Collecting data from %d nodes (timeout: %s)...\n", len(nodes), timeout)
	}
	data := inspector.Collect(ctx, nodes, subsystems, *verbose)
	if *verbose {
		fmt.Printf("Collection complete in %.1fs\n\n", data.Duration.Seconds())
	}
	// Phase 2: Check
	results := inspector.RunChecks(data, subsystems)
	// Phase 3: Report
	switch *format {
	case "json":
		inspector.PrintJSON(results, os.Stdout)
	default:
		inspector.PrintTable(results, os.Stdout)
	}
	// Phase 4: AI Analysis (if enabled and there are failures or warnings)
	if *aiEnabled {
		issues := results.FailuresAndWarnings()
		if len(issues) == 0 {
			fmt.Printf("\nAll checks passed — no AI analysis needed.\n")
		} else {
			fmt.Printf("\nAnalyzing %d issues with %s...\n", len(issues), *aiModel)
			analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey)
			if err != nil {
				// AI analysis is advisory: report the failure but continue.
				fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err)
			} else {
				inspector.PrintAnalysis(analysis, os.Stdout)
			}
		}
	}
	// Exit with non-zero if any failures
	if failures := results.Failures(); len(failures) > 0 {
		os.Exit(1)
	}
}

229
pkg/inspector/analyzer.go Normal file
View File

@ -0,0 +1,229 @@
package inspector
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strings"
"time"
)
// AnalysisResult holds the AI's analysis of check failures.
type AnalysisResult struct {
	Model    string        // OpenRouter model identifier that produced the analysis
	Analysis string        // markdown analysis text returned by the model
	Duration time.Duration // wall-clock time of the OpenRouter API call
}
// Analyze sends failures and cluster context to OpenRouter for AI analysis.
// When apiKey is empty, the OPENROUTER_API_KEY environment variable is used
// instead; an error is returned if neither source provides a key.
func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) {
	key := apiKey
	if key == "" {
		key = os.Getenv("OPENROUTER_API_KEY")
	}
	if key == "" {
		return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env")
	}

	// Build the prompt with failures, warnings, and cluster context.
	prompt := buildAnalysisPrompt(results, data)

	began := time.Now()
	answer, err := callOpenRouter(model, key, prompt)
	if err != nil {
		return nil, fmt.Errorf("OpenRouter API call failed: %w", err)
	}

	return &AnalysisResult{
		Model:    model,
		Analysis: answer,
		Duration: time.Since(began),
	}, nil
}
// buildAnalysisPrompt assembles the markdown prompt sent to the AI model:
// a system-style preamble, a cluster overview, the pass/fail summary,
// the full failure and warning lists, condensed raw RQLite state, and a
// final task instruction.
//
// NOTE(review): the two "range data.Nodes" loops iterate a map, so node
// ordering in the prompt is nondeterministic across runs — harmless for the
// model, but prompts are not reproducible byte-for-byte; sort keys if
// reproducibility matters.
func buildAnalysisPrompt(results *Results, data *ClusterData) string {
	var b strings.Builder
	// System context
	b.WriteString("You are a distributed systems expert analyzing health check results for an Orama Network cluster.\n")
	b.WriteString("The cluster runs RQLite (Raft consensus), Olric (distributed cache), IPFS, CoreDNS, and WireGuard.\n\n")
	// Cluster overview
	b.WriteString("## Cluster Overview\n")
	b.WriteString(fmt.Sprintf("Nodes inspected: %d\n", len(data.Nodes)))
	for host, nd := range data.Nodes {
		b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role))
	}
	b.WriteString("\n")
	// Summary
	passed, failed, warned, skipped := results.Summary()
	b.WriteString(fmt.Sprintf("## Check Results: %d passed, %d failed, %d warnings, %d skipped\n\n", passed, failed, warned, skipped))
	// List all failures
	failures := results.Failures()
	if len(failures) > 0 {
		b.WriteString("## Failures (CRITICAL)\n")
		for _, f := range failures {
			node := f.Node
			if node == "" {
				node = "cluster-wide"
			}
			b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", f.Severity, f.Name, node, f.Message))
		}
		b.WriteString("\n")
	}
	// List all warnings. FailuresAndWarnings returns both kinds, so filter
	// down to StatusWarn only (failures were already listed above).
	warnings := results.FailuresAndWarnings()
	warningsOnly := make([]CheckResult, 0)
	for _, w := range warnings {
		if w.Status == StatusWarn {
			warningsOnly = append(warningsOnly, w)
		}
	}
	if len(warningsOnly) > 0 {
		b.WriteString("## Warnings\n")
		for _, w := range warningsOnly {
			node := w.Node
			if node == "" {
				node = "cluster-wide"
			}
			b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", w.Severity, w.Name, node, w.Message))
		}
		b.WriteString("\n")
	}
	// Add raw RQLite status for context (condensed)
	b.WriteString("## Raw Cluster Data (condensed)\n")
	for host, nd := range data.Nodes {
		if nd.RQLite != nil && nd.RQLite.Status != nil {
			s := nd.RQLite.Status
			b.WriteString(fmt.Sprintf("### %s (RQLite)\n", host))
			b.WriteString(fmt.Sprintf("  raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n",
				s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter))
			if nd.RQLite.Nodes != nil {
				b.WriteString(fmt.Sprintf("  /nodes reports %d members:", len(nd.RQLite.Nodes)))
				for addr, n := range nd.RQLite.Nodes {
					reachable := "ok"
					if !n.Reachable {
						reachable = "UNREACHABLE"
					}
					leader := ""
					if n.Leader {
						leader = " LEADER"
					}
					b.WriteString(fmt.Sprintf(" %s(%s%s)", addr, reachable, leader))
				}
				b.WriteString("\n")
			}
		}
	}
	b.WriteString("\n## Task\n")
	b.WriteString("Analyze the failures and warnings above. For each issue:\n")
	b.WriteString("1. Explain the root cause\n")
	b.WriteString("2. Assess the severity and impact on the cluster\n")
	b.WriteString("3. Suggest specific commands or actions to fix it\n")
	b.WriteString("\nBe concise and actionable. Group related issues together. Use markdown formatting.\n")
	return b.String()
}
// OpenRouter API types (OpenAI-compatible)

// openRouterRequest is the chat-completions request payload.
type openRouterRequest struct {
	Model    string              `json:"model"`
	Messages []openRouterMessage `json:"messages"`
}

// openRouterMessage is a single chat message (role + content).
type openRouterMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// openRouterResponse is the subset of the chat-completions response this
// package reads: the first choice's content, plus an optional error object
// (OpenRouter may return an error body with HTTP 200).
type openRouterResponse struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
	Error *struct {
		Message string `json:"message"`
		Code    int    `json:"code"`
	} `json:"error"`
}
// callOpenRouter posts a single-message chat completion to the OpenRouter
// API and returns the first choice's content. An error is returned for
// transport failures, non-200 statuses, API-level error objects, and
// empty/missing choices.
func callOpenRouter(model, apiKey, prompt string) (string, error) {
	payload, err := json.Marshal(openRouterRequest{
		Model:    model,
		Messages: []openRouterMessage{{Role: "user", Content: prompt}},
	})
	if err != nil {
		return "", fmt.Errorf("marshal request: %w", err)
	}

	req, err := http.NewRequest("POST", "https://openrouter.ai/api/v1/chat/completions", bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+apiKey)

	// Generous timeout: large models can take a while to answer.
	httpClient := &http.Client{Timeout: 120 * time.Second}
	resp, err := httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("HTTP request: %w", err)
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("API returned %d: %s", resp.StatusCode, string(raw))
	}

	var parsed openRouterResponse
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return "", fmt.Errorf("unmarshal response: %w", err)
	}
	switch {
	case parsed.Error != nil:
		return "", fmt.Errorf("API error: %s", parsed.Error.Message)
	case len(parsed.Choices) == 0:
		return "", fmt.Errorf("no choices in response (raw: %s)", truncate(string(raw), 500))
	}

	answer := parsed.Choices[0].Message.Content
	if strings.TrimSpace(answer) == "" {
		return "", fmt.Errorf("model returned empty response (raw: %s)", truncate(string(raw), 500))
	}
	return answer, nil
}
// truncate shortens s to at most max bytes, appending "..." when trimmed.
func truncate(s string, max int) string {
	if len(s) > max {
		return s[:max] + "..."
	}
	return s
}
// PrintAnalysis writes the AI analysis to the output: a heading with the
// model name, a divider, the analysis text, and the elapsed time.
func PrintAnalysis(analysis *AnalysisResult, w io.Writer) {
	fmt.Fprintf(w, "\n## AI Analysis (%s)\n", analysis.Model)
	fmt.Fprintln(w, strings.Repeat("-", 70))
	fmt.Fprintln(w, analysis.Analysis)
	fmt.Fprintf(w, "\n(Analysis took %.1fs)\n", analysis.Duration.Seconds())
}

172
pkg/inspector/checker.go Normal file
View File

@ -0,0 +1,172 @@
package inspector
import (
	"sort"
	"time"
)
// Severity levels for check results, ordered least to most serious.
type Severity int

const (
	Low Severity = iota
	Medium
	High
	Critical
)

// severityLabels maps each defined Severity to its display string.
var severityLabels = [...]string{
	Low:      "LOW",
	Medium:   "MEDIUM",
	High:     "HIGH",
	Critical: "CRITICAL",
}

// String returns the upper-case label for s, or "UNKNOWN" for values
// outside the defined range.
func (s Severity) String() string {
	if s < Low || int(s) >= len(severityLabels) {
		return "UNKNOWN"
	}
	return severityLabels[s]
}
// Status represents the outcome of a check.
type Status string

const (
	StatusPass Status = "pass"
	StatusFail Status = "fail"
	StatusWarn Status = "warn"
	StatusSkip Status = "skip"
)

// CheckResult holds the outcome of a single health check.
type CheckResult struct {
	ID        string   `json:"id"`   // e.g. "rqlite.leader_exists"
	Name      string   `json:"name"` // "Cluster has exactly one leader"
	Subsystem string   `json:"subsystem"` // "rqlite"
	Severity  Severity `json:"severity"`
	Status    Status   `json:"status"`
	Message   string   `json:"message"`        // human-readable detail
	Node      string   `json:"node,omitempty"` // which node (empty for cluster-wide)
}

// Results holds all check outcomes.
type Results struct {
	Checks   []CheckResult `json:"checks"`
	Duration time.Duration `json:"duration"` // total time spent running checks
}
// Summary returns counts by status. Checks with an unrecognized status are
// not counted in any bucket.
func (r *Results) Summary() (passed, failed, warned, skipped int) {
	buckets := map[Status]*int{
		StatusPass: &passed,
		StatusFail: &failed,
		StatusWarn: &warned,
		StatusSkip: &skipped,
	}
	for _, c := range r.Checks {
		if counter, ok := buckets[c.Status]; ok {
			*counter++
		}
	}
	return
}
// Failures returns only failed checks (nil when there are none).
func (r *Results) Failures() []CheckResult {
	var failed []CheckResult
	for i := range r.Checks {
		if r.Checks[i].Status == StatusFail {
			failed = append(failed, r.Checks[i])
		}
	}
	return failed
}
// FailuresAndWarnings returns failed and warning checks (nil when none).
func (r *Results) FailuresAndWarnings() []CheckResult {
	var issues []CheckResult
	for i := range r.Checks {
		switch r.Checks[i].Status {
		case StatusFail, StatusWarn:
			issues = append(issues, r.Checks[i])
		}
	}
	return issues
}
// CheckFunc is the signature for a subsystem check function.
type CheckFunc func(data *ClusterData) []CheckResult

// SubsystemCheckers maps subsystem names to their check functions.
// Populated by checks/ package init or by explicit registration.
// NOTE(review): plain map with no mutex — registration is assumed to occur
// only from init functions, before any concurrent use; confirm no runtime
// registration happens from goroutines.
var SubsystemCheckers = map[string]CheckFunc{}

// RegisterChecker registers a check function for a subsystem.
func RegisterChecker(subsystem string, fn CheckFunc) {
	SubsystemCheckers[subsystem] = fn
}
// RunChecks executes checks for the requested subsystems against collected data.
//
// subsystems filters which registered checkers run: an empty list (or an
// entry "all") runs everything, and "wg" is accepted as an alias for
// "wireguard". Checkers run in sorted subsystem-name order so that the
// resulting check list — and any report built from it — is deterministic
// across runs (Go map iteration order is randomized).
func RunChecks(data *ClusterData, subsystems []string) *Results {
	start := time.Now()
	results := &Results{}
	// shouldCheck reports whether the named subsystem was requested.
	shouldCheck := func(name string) bool {
		if len(subsystems) == 0 {
			return true
		}
		for _, s := range subsystems {
			if s == name || s == "all" {
				return true
			}
			// Alias: "wg" matches "wireguard"
			if s == "wg" && name == "wireguard" {
				return true
			}
		}
		return false
	}
	// Iterate registered checkers in sorted name order for stable output.
	names := make([]string, 0, len(SubsystemCheckers))
	for name := range SubsystemCheckers {
		names = append(names, name)
	}
	sort.Strings(names)
	for _, name := range names {
		if shouldCheck(name) {
			results.Checks = append(results.Checks, SubsystemCheckers[name](data)...)
		}
	}
	results.Duration = time.Since(start)
	return results
}
// Pass creates a passing check result.
func Pass(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusPass, Message: msg, Node: node,
}
}
// Fail creates a failing check result.
func Fail(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusFail, Message: msg, Node: node,
}
}
// Warn creates a warning check result.
func Warn(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusWarn, Message: msg, Node: node,
}
}
// Skip creates a skipped check result.
func Skip(id, name, subsystem, node, msg string, sev Severity) CheckResult {
return CheckResult{
ID: id, Name: name, Subsystem: subsystem,
Severity: sev, Status: StatusSkip, Message: msg, Node: node,
}
}

View File

@ -0,0 +1,190 @@
package inspector
import (
"testing"
"time"
)
// TestSummary verifies that Summary tallies each status into its bucket.
func TestSummary(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusPass},
			{ID: "c", Status: StatusFail},
			{ID: "d", Status: StatusWarn},
			{ID: "e", Status: StatusSkip},
			{ID: "f", Status: StatusPass},
		},
	}
	passed, failed, warned, skipped := r.Summary()
	if passed != 3 {
		t.Errorf("passed: want 3, got %d", passed)
	}
	if failed != 1 {
		t.Errorf("failed: want 1, got %d", failed)
	}
	if warned != 1 {
		t.Errorf("warned: want 1, got %d", warned)
	}
	if skipped != 1 {
		t.Errorf("skipped: want 1, got %d", skipped)
	}
}

// TestFailures verifies that Failures returns exactly the StatusFail checks.
func TestFailures(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusFail},
			{ID: "c", Status: StatusWarn},
			{ID: "d", Status: StatusFail},
		},
	}
	failures := r.Failures()
	if len(failures) != 2 {
		t.Fatalf("want 2 failures, got %d", len(failures))
	}
	for _, f := range failures {
		if f.Status != StatusFail {
			t.Errorf("expected StatusFail, got %s for check %s", f.Status, f.ID)
		}
	}
}

// TestFailuresAndWarnings verifies fail+warn are returned, pass/skip excluded.
func TestFailuresAndWarnings(t *testing.T) {
	r := &Results{
		Checks: []CheckResult{
			{ID: "a", Status: StatusPass},
			{ID: "b", Status: StatusFail},
			{ID: "c", Status: StatusWarn},
			{ID: "d", Status: StatusSkip},
		},
	}
	fw := r.FailuresAndWarnings()
	if len(fw) != 2 {
		t.Fatalf("want 2 failures+warnings, got %d", len(fw))
	}
}

// TestPass verifies the Pass constructor sets status, severity, and node.
func TestPass(t *testing.T) {
	c := Pass("test.id", "Test Name", "sub", "node1", "msg", Critical)
	if c.Status != StatusPass {
		t.Errorf("want pass, got %s", c.Status)
	}
	if c.Severity != Critical {
		t.Errorf("want Critical, got %s", c.Severity)
	}
	if c.Node != "node1" {
		t.Errorf("want node1, got %s", c.Node)
	}
}

// TestFail verifies the Fail constructor, including the cluster-wide
// (empty node) case.
func TestFail(t *testing.T) {
	c := Fail("test.id", "Test Name", "sub", "", "msg", High)
	if c.Status != StatusFail {
		t.Errorf("want fail, got %s", c.Status)
	}
	if c.Node != "" {
		t.Errorf("want empty node, got %q", c.Node)
	}
}

// TestWarn verifies the Warn constructor sets StatusWarn.
func TestWarn(t *testing.T) {
	c := Warn("test.id", "Test Name", "sub", "n", "msg", Medium)
	if c.Status != StatusWarn {
		t.Errorf("want warn, got %s", c.Status)
	}
}

// TestSkip verifies the Skip constructor sets StatusSkip.
func TestSkip(t *testing.T) {
	c := Skip("test.id", "Test Name", "sub", "n", "msg", Low)
	if c.Status != StatusSkip {
		t.Errorf("want skip, got %s", c.Status)
	}
}

// TestSeverityString covers all defined severities plus the out-of-range
// fallback ("UNKNOWN").
func TestSeverityString(t *testing.T) {
	tests := []struct {
		sev  Severity
		want string
	}{
		{Low, "LOW"},
		{Medium, "MEDIUM"},
		{High, "HIGH"},
		{Critical, "CRITICAL"},
		{Severity(99), "UNKNOWN"},
	}
	for _, tt := range tests {
		t.Run(tt.want, func(t *testing.T) {
			if got := tt.sev.String(); got != tt.want {
				t.Errorf("Severity(%d).String() = %q, want %q", tt.sev, got, tt.want)
			}
		})
	}
}

// TestRunChecks_EmptyData verifies RunChecks handles a cluster with no
// nodes without panicking.
func TestRunChecks_EmptyData(t *testing.T) {
	data := &ClusterData{
		Nodes:    map[string]*NodeData{},
		Duration: time.Second,
	}
	results := RunChecks(data, nil)
	if results == nil {
		t.Fatal("RunChecks returned nil")
	}
	// Should not panic and should return a valid Results
}

// TestRunChecks_FilterBySubsystem verifies that only checkers for the
// requested subsystem run, and that their results are included.
func TestRunChecks_FilterBySubsystem(t *testing.T) {
	// Register a test checker (and clean up afterwards so other tests are
	// not affected by the shared registry).
	called := map[string]bool{}
	SubsystemCheckers["test_sub_a"] = func(data *ClusterData) []CheckResult {
		called["a"] = true
		return []CheckResult{Pass("a.1", "A1", "test_sub_a", "", "ok", Low)}
	}
	SubsystemCheckers["test_sub_b"] = func(data *ClusterData) []CheckResult {
		called["b"] = true
		return []CheckResult{Pass("b.1", "B1", "test_sub_b", "", "ok", Low)}
	}
	defer delete(SubsystemCheckers, "test_sub_a")
	defer delete(SubsystemCheckers, "test_sub_b")
	data := &ClusterData{Nodes: map[string]*NodeData{}}
	// Filter to only "test_sub_a"
	results := RunChecks(data, []string{"test_sub_a"})
	if !called["a"] {
		t.Error("test_sub_a checker was not called")
	}
	if called["b"] {
		t.Error("test_sub_b checker should not have been called")
	}
	found := false
	for _, c := range results.Checks {
		if c.ID == "a.1" {
			found = true
		}
		if c.Subsystem == "test_sub_b" {
			t.Error("should not have checks from test_sub_b")
		}
	}
	if !found {
		t.Error("expected check a.1 in results")
	}
}

// TestRunChecks_AliasWG verifies the "wg" alias triggers the "wireguard"
// checker.
func TestRunChecks_AliasWG(t *testing.T) {
	called := false
	SubsystemCheckers["wireguard"] = func(data *ClusterData) []CheckResult {
		called = true
		return nil
	}
	defer delete(SubsystemCheckers, "wireguard")
	data := &ClusterData{Nodes: map[string]*NodeData{}}
	RunChecks(data, []string{"wg"})
	if !called {
		t.Error("wireguard checker not called via 'wg' alias")
	}
}

224
pkg/inspector/checks/dns.go Normal file
View File

@ -0,0 +1,224 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the DNS checker in the inspector's subsystem registry so
// it runs as part of "all" (or "dns") inspections.
func init() {
	inspector.RegisterChecker("dns", CheckDNS)
}

// dnsSub is the subsystem label attached to every DNS check result.
const dnsSub = "dns"
// CheckDNS runs all DNS/CoreDNS health checks against cluster data.
// Nodes that reported no DNS data are skipped; cluster-wide checks are
// appended after the per-node ones.
func CheckDNS(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.DNS == nil {
			continue
		}
		out = append(out, checkDNSPerNode(node)...)
	}
	return append(out, checkDNSCrossNode(data)...)
}
// checkDNSPerNode evaluates DNS health for a single node: service liveness
// (CoreDNS, Caddy), bound ports (53/80/443), CoreDNS memory, restart count,
// recent log errors, Corefile presence, record resolution (SOA/NS/wildcard/
// base A), and TLS certificate expiry.
//
// If CoreDNS itself is not active, the function returns early with only
// that failure — the remaining checks would be meaningless noise.
// TLS checks are emitted only when a non-negative days-left value was
// collected (negative means the cert was not checked).
func checkDNSPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	dns := nd.DNS
	node := nd.Node.Name()
	// 4.1 CoreDNS service running
	if dns.CoreDNSActive {
		r = append(r, inspector.Pass("dns.coredns_active", "CoreDNS service active", dnsSub, node,
			"coredns is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.coredns_active", "CoreDNS service active", dnsSub, node,
			"coredns is not active", inspector.Critical))
		// Early return: no point checking ports/resolution when CoreDNS is down.
		return r
	}
	// 4.47 Caddy service running
	if dns.CaddyActive {
		r = append(r, inspector.Pass("dns.caddy_active", "Caddy service active", dnsSub, node,
			"caddy is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.caddy_active", "Caddy service active", dnsSub, node,
			"caddy is not active", inspector.Critical))
	}
	// 4.8 DNS port 53 bound
	if dns.Port53Bound {
		r = append(r, inspector.Pass("dns.port_53", "DNS port 53 bound", dnsSub, node,
			"UDP 53 is listening", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.port_53", "DNS port 53 bound", dnsSub, node,
			"UDP 53 is NOT listening", inspector.Critical))
	}
	// 4.10 HTTP port 80 (warn only — HTTPS is the critical path)
	if dns.Port80Bound {
		r = append(r, inspector.Pass("dns.port_80", "HTTP port 80 bound", dnsSub, node,
			"TCP 80 is listening", inspector.High))
	} else {
		r = append(r, inspector.Warn("dns.port_80", "HTTP port 80 bound", dnsSub, node,
			"TCP 80 is NOT listening", inspector.High))
	}
	// 4.11 HTTPS port 443
	if dns.Port443Bound {
		r = append(r, inspector.Pass("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
			"TCP 443 is listening", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.port_443", "HTTPS port 443 bound", dnsSub, node,
			"TCP 443 is NOT listening", inspector.Critical))
	}
	// 4.3 CoreDNS memory — thresholds: <100MB pass, <200MB warn, else fail.
	// Skipped entirely when no RSS value was collected (0).
	if dns.CoreDNSMemMB > 0 {
		if dns.CoreDNSMemMB < 100 {
			r = append(r, inspector.Pass("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB", dns.CoreDNSMemMB), inspector.Medium))
		} else if dns.CoreDNSMemMB < 200 {
			r = append(r, inspector.Warn("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB (elevated)", dns.CoreDNSMemMB), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node,
				fmt.Sprintf("RSS=%dMB (high)", dns.CoreDNSMemMB), inspector.High))
		}
	}
	// 4.4 CoreDNS restart count — 0 pass, 1-3 warn, >3 fail.
	if dns.CoreDNSRestarts == 0 {
		r = append(r, inspector.Pass("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			"NRestarts=0", inspector.High))
	} else if dns.CoreDNSRestarts <= 3 {
		r = append(r, inspector.Warn("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			fmt.Sprintf("NRestarts=%d", dns.CoreDNSRestarts), inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node,
			fmt.Sprintf("NRestarts=%d (crash-looping?)", dns.CoreDNSRestarts), inspector.High))
	}
	// 4.7 CoreDNS log error rate — 0 pass, <5 warn, else fail.
	if dns.LogErrors == 0 {
		r = append(r, inspector.Pass("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			"0 errors in last 5 minutes", inspector.High))
	} else if dns.LogErrors < 5 {
		r = append(r, inspector.Warn("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node,
			fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High))
	}
	// 4.14 Corefile exists
	if dns.CorefileExists {
		r = append(r, inspector.Pass("dns.corefile_exists", "Corefile exists", dnsSub, node,
			"/etc/coredns/Corefile present", inspector.High))
	} else {
		r = append(r, inspector.Fail("dns.corefile_exists", "Corefile exists", dnsSub, node,
			"/etc/coredns/Corefile NOT found", inspector.High))
	}
	// 4.20 SOA resolution
	if dns.SOAResolves {
		r = append(r, inspector.Pass("dns.soa_resolves", "SOA record resolves", dnsSub, node,
			"dig SOA returned result", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.soa_resolves", "SOA record resolves", dnsSub, node,
			"dig SOA returned no result", inspector.Critical))
	}
	// 4.21 NS records resolve
	if dns.NSResolves {
		r = append(r, inspector.Pass("dns.ns_resolves", "NS records resolve", dnsSub, node,
			fmt.Sprintf("%d NS records returned", dns.NSRecordCount), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.ns_resolves", "NS records resolve", dnsSub, node,
			"dig NS returned no results", inspector.Critical))
	}
	// 4.23 Wildcard DNS resolution
	if dns.WildcardResolves {
		r = append(r, inspector.Pass("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
			"test-wildcard.<domain> returned IP", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node,
			"test-wildcard.<domain> returned no IP", inspector.Critical))
	}
	// 4.24 Base domain A record (warn only, unlike the wildcard check)
	if dns.BaseAResolves {
		r = append(r, inspector.Pass("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
			"<domain> A record returned IP", inspector.High))
	} else {
		r = append(r, inspector.Warn("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node,
			"<domain> A record returned no IP", inspector.High))
	}
	// 4.50 TLS certificate - base domain (>30d pass, >7d warn, else fail;
	// negative days-left means not checked and emits nothing)
	if dns.BaseTLSDaysLeft >= 0 {
		if dns.BaseTLSDaysLeft > 30 {
			r = append(r, inspector.Pass("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry", dns.BaseTLSDaysLeft), inspector.Critical))
		} else if dns.BaseTLSDaysLeft > 7 {
			r = append(r, inspector.Warn("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (expiring soon)", dns.BaseTLSDaysLeft), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("dns.tls_base", "Base domain TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (CRITICAL)", dns.BaseTLSDaysLeft), inspector.Critical))
		}
	}
	// 4.51 TLS certificate - wildcard (same thresholds as base domain)
	if dns.WildTLSDaysLeft >= 0 {
		if dns.WildTLSDaysLeft > 30 {
			r = append(r, inspector.Pass("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry", dns.WildTLSDaysLeft), inspector.Critical))
		} else if dns.WildTLSDaysLeft > 7 {
			r = append(r, inspector.Warn("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (expiring soon)", dns.WildTLSDaysLeft), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node,
				fmt.Sprintf("%d days until expiry (CRITICAL)", dns.WildTLSDaysLeft), inspector.Critical))
		}
	}
	return r
}
// checkDNSCrossNode evaluates cluster-wide DNS health: every node that
// reported DNS data must have an active CoreDNS service. Returns no checks
// when no node reported DNS data at all.
func checkDNSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	total, active := 0, 0
	for _, nd := range data.Nodes {
		if nd.DNS == nil {
			continue
		}
		total++
		if nd.DNS.CoreDNSActive {
			active++
		}
	}
	if total == 0 {
		return out
	}
	msg := fmt.Sprintf("%d/%d nameservers active", active, total)
	if active == total {
		out = append(out, inspector.Pass("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "",
			msg, inspector.Critical))
	} else {
		out = append(out, inspector.Fail("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "",
			msg, inspector.Critical))
	}
	return out
}

View File

@ -0,0 +1,232 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckDNS_CoreDNSInactive verifies the early-return path: when CoreDNS
// is down, only the liveness failure is emitted and port checks are skipped.
func TestCheckDNS_CoreDNSInactive(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{CoreDNSActive: false}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.coredns_active", inspector.StatusFail)
	// Early return — no port checks
	if findCheck(results, "dns.port_53") != nil {
		t.Error("should not check ports when CoreDNS inactive")
	}
}

// TestCheckDNS_HealthyNode verifies that a fully healthy node passes every
// per-node DNS check.
func TestCheckDNS_HealthyNode(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:    true,
		CaddyActive:      true,
		Port53Bound:      true,
		Port80Bound:      true,
		Port443Bound:     true,
		CoreDNSMemMB:     50,
		CoreDNSRestarts:  0,
		LogErrors:        0,
		CorefileExists:   true,
		SOAResolves:      true,
		NSResolves:       true,
		NSRecordCount:    3,
		WildcardResolves: true,
		BaseAResolves:    true,
		BaseTLSDaysLeft:  60,
		WildTLSDaysLeft:  60,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.coredns_active", inspector.StatusPass)
	expectStatus(t, results, "dns.caddy_active", inspector.StatusPass)
	expectStatus(t, results, "dns.port_53", inspector.StatusPass)
	expectStatus(t, results, "dns.port_80", inspector.StatusPass)
	expectStatus(t, results, "dns.port_443", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_memory", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_restarts", inspector.StatusPass)
	expectStatus(t, results, "dns.coredns_log_errors", inspector.StatusPass)
	expectStatus(t, results, "dns.corefile_exists", inspector.StatusPass)
	expectStatus(t, results, "dns.soa_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.ns_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.base_a_resolves", inspector.StatusPass)
	expectStatus(t, results, "dns.tls_base", inspector.StatusPass)
	expectStatus(t, results, "dns.tls_wildcard", inspector.StatusPass)
}

// TestCheckDNS_PortsFailing verifies port outcomes: 53/443 fail hard while
// 80 only warns.
func TestCheckDNS_PortsFailing(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive: true,
		Port53Bound:   false,
		Port80Bound:   false,
		Port443Bound:  false,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.port_53", inspector.StatusFail)
	expectStatus(t, results, "dns.port_80", inspector.StatusWarn)
	expectStatus(t, results, "dns.port_443", inspector.StatusFail)
}

// TestCheckDNS_Memory covers the <100/<200/>=200 MB memory thresholds.
func TestCheckDNS_Memory(t *testing.T) {
	tests := []struct {
		name   string
		memMB  int
		status inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 150, inspector.StatusWarn},
		{"high", 250, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSMemMB: tt.memMB}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_memory", tt.status)
		})
	}
}

// TestCheckDNS_Restarts covers the 0/<=3/>3 restart-count thresholds.
func TestCheckDNS_Restarts(t *testing.T) {
	tests := []struct {
		name     string
		restarts int
		status   inspector.Status
	}{
		{"zero", 0, inspector.StatusPass},
		{"few", 2, inspector.StatusWarn},
		{"many", 5, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSRestarts: tt.restarts}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_restarts", tt.status)
		})
	}
}

// TestCheckDNS_LogErrors covers the 0/<5/>=5 log-error thresholds.
func TestCheckDNS_LogErrors(t *testing.T) {
	tests := []struct {
		name   string
		errors int
		status inspector.Status
	}{
		{"none", 0, inspector.StatusPass},
		{"few", 3, inspector.StatusWarn},
		{"many", 10, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{CoreDNSActive: true, LogErrors: tt.errors}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.coredns_log_errors", tt.status)
		})
	}
}

// TestCheckDNS_TLSExpiry covers the >30/>7/<=7 days-to-expiry thresholds
// for both the base-domain and wildcard certificates.
func TestCheckDNS_TLSExpiry(t *testing.T) {
	tests := []struct {
		name   string
		days   int
		status inspector.Status
	}{
		{"healthy", 60, inspector.StatusPass},
		{"expiring soon", 20, inspector.StatusWarn},
		{"critical", 3, inspector.StatusFail},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			nd := makeNodeData("5.5.5.5", "nameserver-ns1")
			nd.DNS = &inspector.DNSData{
				CoreDNSActive:   true,
				BaseTLSDaysLeft: tt.days,
				WildTLSDaysLeft: tt.days,
			}
			data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
			results := CheckDNS(data)
			expectStatus(t, results, "dns.tls_base", tt.status)
			expectStatus(t, results, "dns.tls_wildcard", tt.status)
		})
	}
}

// TestCheckDNS_TLSNotChecked verifies negative days-left suppresses the TLS
// checks entirely.
func TestCheckDNS_TLSNotChecked(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:   true,
		BaseTLSDaysLeft: -1,
		WildTLSDaysLeft: -1,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	// TLS checks should not be emitted when days == -1
	if findCheck(results, "dns.tls_base") != nil {
		t.Error("should not emit tls_base when days == -1")
	}
}

// TestCheckDNS_ResolutionFailures verifies SOA/NS/wildcard failures fail
// hard while the base A record only warns.
func TestCheckDNS_ResolutionFailures(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.DNS = &inspector.DNSData{
		CoreDNSActive:    true,
		SOAResolves:      false,
		NSResolves:       false,
		WildcardResolves: false,
		BaseAResolves:    false,
	}
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	expectStatus(t, results, "dns.soa_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.ns_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusFail)
	expectStatus(t, results, "dns.base_a_resolves", inspector.StatusWarn)
}

// TestCheckDNS_CrossNode_AllActive verifies the cluster-wide check passes
// when every nameserver runs CoreDNS.
func TestCheckDNS_CrossNode_AllActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
		nd := makeNodeData(host, "nameserver-ns1")
		nd.DNS = &inspector.DNSData{CoreDNSActive: true}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckDNS(data)
	expectStatus(t, results, "dns.all_ns_active", inspector.StatusPass)
}

// TestCheckDNS_CrossNode_PartialActive verifies the cluster-wide check
// fails when any nameserver's CoreDNS is down.
func TestCheckDNS_CrossNode_PartialActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	active := []bool{true, true, false}
	for i, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} {
		nd := makeNodeData(host, "nameserver-ns1")
		nd.DNS = &inspector.DNSData{CoreDNSActive: active[i]}
		nodes[host] = nd
	}
	data := makeCluster(nodes)
	results := CheckDNS(data)
	expectStatus(t, results, "dns.all_ns_active", inspector.StatusFail)
}

// TestCheckDNS_NilData verifies nodes without DNS data produce no checks.
func TestCheckDNS_NilData(t *testing.T) {
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd})
	results := CheckDNS(data)
	if len(results) != 0 {
		t.Errorf("expected 0 results for nil DNS data, got %d", len(results))
	}
}

View File

@ -0,0 +1,74 @@
package checks
import (
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// makeNode creates a test Node with the given host and role.
func makeNode(host, role string) inspector.Node {
return inspector.Node{
Environment: "devnet",
User: "ubuntu",
Host: host,
Password: "test",
Role: role,
}
}
// makeNodeData creates a NodeData with a node but no subsystem data.
func makeNodeData(host, role string) *inspector.NodeData {
return &inspector.NodeData{
Node: makeNode(host, role),
}
}
// makeCluster creates a ClusterData from a map of host → NodeData.
func makeCluster(nodes map[string]*inspector.NodeData) *inspector.ClusterData {
return &inspector.ClusterData{
Nodes: nodes,
Duration: 1 * time.Second,
}
}
// countByStatus counts results with the given status.
func countByStatus(results []inspector.CheckResult, status inspector.Status) int {
n := 0
for _, r := range results {
if r.Status == status {
n++
}
}
return n
}
// findCheck returns a pointer to the first result whose ID matches id,
// or nil when no such check exists.
func findCheck(results []inspector.CheckResult, id string) *inspector.CheckResult {
	for i := range results {
		if results[i].ID != id {
			continue
		}
		return &results[i]
	}
	return nil
}
// requireCheck looks up a check by ID and aborts the test when it is missing.
func requireCheck(t *testing.T, results []inspector.CheckResult, id string) inspector.CheckResult {
	t.Helper()
	found := findCheck(results, id)
	if found == nil {
		t.Fatalf("check %q not found in %d results", id, len(results))
	}
	return *found
}
// expectStatus asserts that the check identified by id reports the wanted status.
func expectStatus(t *testing.T, results []inspector.CheckResult, id string, status inspector.Status) {
	t.Helper()
	got := requireCheck(t, results, id)
	if got.Status != status {
		t.Errorf("check %q: want status=%s, got status=%s (msg=%s)", id, status, got.Status, got.Message)
	}
}

View File

@ -0,0 +1,232 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers CheckIPFS under the "ipfs" subsystem key so the inspector
// runs it as part of a full cluster inspection.
func init() {
	inspector.RegisterChecker("ipfs", CheckIPFS)
}
const ipfsSub = "ipfs"
// CheckIPFS runs all IPFS health checks against cluster data: per-node checks
// for every node that reported IPFS data, followed by the cross-node checks.
// Nodes with no collected IPFS data are skipped entirely.
func CheckIPFS(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.IPFS == nil {
			continue
		}
		out = append(out, checkIPFSPerNode(node, data)...)
	}
	return append(out, checkIPFSCrossNode(data)...)
}
// checkIPFSPerNode emits the per-node IPFS checks for a single node.
// The numbered comments reference items from the inspection checklist.
// If the IPFS daemon itself is down, only that one failure is reported —
// the remaining collected values would be meaningless.
func checkIPFSPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	ipfs := nd.IPFS
	node := nd.Node.Name()
	// 3.1 IPFS daemon running
	if ipfs.DaemonActive {
		r = append(r, inspector.Pass("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node,
			"debros-ipfs is not active", inspector.Critical))
		// Daemon down: skip all remaining per-node checks.
		return r
	}
	// 3.2 IPFS Cluster running
	if ipfs.ClusterActive {
		r = append(r, inspector.Pass("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is active", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node,
			"debros-ipfs-cluster is not active", inspector.Critical))
	}
	// 3.6 Swarm peer count. In a fully meshed private swarm each node should
	// see every other IPFS node, so the expectation is nodes-1.
	// NOTE(review): a negative SwarmPeerCount is treated as "not collected"
	// and skips the check — confirm against the collector.
	expectedNodes := countIPFSNodes(data)
	if ipfs.SwarmPeerCount >= 0 {
		expectedPeers := expectedNodes - 1
		if expectedPeers < 0 {
			expectedPeers = 0
		}
		if ipfs.SwarmPeerCount >= expectedPeers {
			r = append(r, inspector.Pass("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else if ipfs.SwarmPeerCount > 0 {
			// Connected to some peers but fewer than expected: degraded, not isolated.
			r = append(r, inspector.Warn("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High))
		} else {
			// Zero peers while others exist: the node is cut off from the swarm.
			r = append(r, inspector.Fail("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node,
				fmt.Sprintf("peers=%d (isolated!)", ipfs.SwarmPeerCount), inspector.Critical))
		}
	}
	// 3.12 Cluster peer count. Cluster membership includes the node itself,
	// so the expectation is the full node count (negative = not collected).
	if ipfs.ClusterPeerCount >= 0 {
		if ipfs.ClusterPeerCount >= expectedNodes {
			r = append(r, inspector.Pass("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		} else {
			r = append(r, inspector.Warn("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node,
				fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical))
		}
	}
	// 3.14 Cluster peer errors
	if ipfs.ClusterErrors == 0 {
		r = append(r, inspector.Pass("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			"all cluster peers healthy", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node,
			fmt.Sprintf("%d peers reporting errors", ipfs.ClusterErrors), inspector.Critical))
	}
	// 3.20 Repo size vs max: pass below 80% usage, warn below 95%, fail above.
	// Skipped when either value was not collected (<= 0).
	if ipfs.RepoMaxBytes > 0 && ipfs.RepoSizeBytes > 0 {
		pct := float64(ipfs.RepoSizeBytes) / float64(ipfs.RepoMaxBytes) * 100
		sizeMB := ipfs.RepoSizeBytes / (1024 * 1024)
		maxMB := ipfs.RepoMaxBytes / (1024 * 1024)
		if pct < 80 {
			r = append(r, inspector.Pass("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else if pct < 95 {
			r = append(r, inspector.Warn("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High))
		} else {
			r = append(r, inspector.Fail("ipfs.repo_size", "Repo size below limit", ipfsSub, node,
				fmt.Sprintf("repo=%dMB/%dMB (%.0f%% NEARLY FULL)", sizeMB, maxMB, pct), inspector.Critical))
		}
	}
	// 3.3 Version: informational pass when a real version string was collected.
	if ipfs.KuboVersion != "" && ipfs.KuboVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.kubo_version", "Kubo version reported", ipfsSub, node,
			fmt.Sprintf("kubo=%s", ipfs.KuboVersion), inspector.Low))
	}
	if ipfs.ClusterVersion != "" && ipfs.ClusterVersion != "unknown" {
		r = append(r, inspector.Pass("ipfs.cluster_version", "Cluster version reported", ipfsSub, node,
			fmt.Sprintf("cluster=%s", ipfs.ClusterVersion), inspector.Low))
	}
	// 3.29 Swarm key exists (private swarm)
	if ipfs.HasSwarmKey {
		r = append(r, inspector.Pass("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node,
			"swarm.key NOT found", inspector.Critical))
	}
	// 3.30 Bootstrap empty (private swarm): a private swarm must not list
	// public bootstrap peers.
	if ipfs.BootstrapEmpty {
		r = append(r, inspector.Pass("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"no public bootstrap peers", inspector.High))
	} else {
		r = append(r, inspector.Warn("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node,
			"bootstrap list is not empty (should be empty for private swarm)", inspector.High))
	}
	return r
}
// checkIPFSCrossNode emits cluster-wide IPFS checks: Kubo and Cluster version
// consistency, and repo-size convergence. Only nodes with an active daemon
// participate; with fewer than two such nodes there is nothing to compare.
func checkIPFSCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	type nodeInfo struct {
		name string
		ipfs *inspector.IPFSData
	}
	var nodes []nodeInfo
	for _, nd := range data.Nodes {
		if nd.IPFS != nil && nd.IPFS.DaemonActive {
			nodes = append(nodes, nodeInfo{name: nd.Node.Name(), ipfs: nd.IPFS})
		}
	}
	if len(nodes) < 2 {
		return r
	}
	// Version consistency: group node names by reported version; exactly one
	// group means all nodes agree. Empty/"unknown" versions are ignored.
	kuboVersions := map[string][]string{}
	clusterVersions := map[string][]string{}
	for _, n := range nodes {
		if n.ipfs.KuboVersion != "" && n.ipfs.KuboVersion != "unknown" {
			kuboVersions[n.ipfs.KuboVersion] = append(kuboVersions[n.ipfs.KuboVersion], n.name)
		}
		if n.ipfs.ClusterVersion != "" && n.ipfs.ClusterVersion != "unknown" {
			clusterVersions[n.ipfs.ClusterVersion] = append(clusterVersions[n.ipfs.ClusterVersion], n.name)
		}
	}
	if len(kuboVersions) == 1 {
		// Single-entry map: the loop runs once just to extract the version key.
		for v := range kuboVersions {
			r = append(r, inspector.Pass("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(kuboVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(kuboVersions)), inspector.Medium))
	}
	if len(clusterVersions) == 1 {
		for v := range clusterVersions {
			r = append(r, inspector.Pass("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
				fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium))
		}
	} else if len(clusterVersions) > 1 {
		r = append(r, inspector.Warn("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "",
			fmt.Sprintf("%d different versions", len(clusterVersions)), inspector.Medium))
	}
	// Repo size convergence: with replicated content, repo sizes should stay
	// within the same order of magnitude; a max/min ratio above 2x warns.
	var sizes []int64
	for _, n := range nodes {
		if n.ipfs.RepoSizeBytes > 0 {
			sizes = append(sizes, n.ipfs.RepoSizeBytes)
		}
	}
	if len(sizes) >= 2 {
		minSize, maxSize := sizes[0], sizes[0]
		for _, s := range sizes[1:] {
			if s < minSize {
				minSize = s
			}
			if s > maxSize {
				maxSize = s
			}
		}
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 2.0 {
				r = append(r, inspector.Pass("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx", ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "",
					fmt.Sprintf("ratio=%.1fx (diverged)", ratio), inspector.Medium))
			}
		}
	}
	return r
}
// countIPFSNodes returns how many nodes in the cluster reported IPFS data.
func countIPFSNodes(data *inspector.ClusterData) int {
	n := 0
	for _, node := range data.Nodes {
		if node.IPFS != nil {
			n++
		}
	}
	return n
}

View File

@ -0,0 +1,183 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckIPFS_DaemonInactive verifies the daemon check fails and that the
// remaining per-node checks short-circuit when the IPFS daemon is down.
func TestCheckIPFS_DaemonInactive(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.IPFS = &inspector.IPFSData{}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ipfs.daemon_active", inspector.StatusFail)
	// Early return — no swarm peer checks
	if c := findCheck(results, "ipfs.swarm_peers"); c != nil {
		t.Error("should not check swarm_peers when daemon inactive")
	}
}
// TestCheckIPFS_HealthyNode verifies that a fully healthy single-node setup
// passes every per-node IPFS check.
func TestCheckIPFS_HealthyNode(t *testing.T) {
	const mb = 1024 * 1024
	nd := makeNodeData("1.1.1.1", "node")
	nd.IPFS = &inspector.IPFSData{
		DaemonActive:     true,
		ClusterActive:    true,
		SwarmPeerCount:   0, // single node: expected peers = 0
		ClusterPeerCount: 1, // single node cluster
		ClusterErrors:    0,
		RepoSizeBytes:    500 * mb,  // 500MB of 1GB = 50%
		RepoMaxBytes:     1024 * mb, // 1GB
		KuboVersion:      "0.22.0",
		ClusterVersion:   "1.0.8",
		HasSwarmKey:      true,
		BootstrapEmpty:   true,
	}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	passing := []string{
		"ipfs.daemon_active", "ipfs.cluster_active", "ipfs.swarm_peers",
		"ipfs.cluster_peers", "ipfs.cluster_errors", "ipfs.repo_size",
		"ipfs.swarm_key", "ipfs.bootstrap_empty",
	}
	for _, id := range passing {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckIPFS_SwarmPeers covers the three swarm-peer outcomes: pass when a
// node meets the expected peer count (nodes-1), warn when it has some peers
// but fewer than expected, and fail when it has zero peers while other nodes
// exist (isolated).
func TestCheckIPFS_SwarmPeers(t *testing.T) {
	// Single-node cluster: expected peers = 0
	t.Run("enough", func(t *testing.T) {
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
		results := CheckIPFS(data)
		// swarm_peers=2, expected=0 → pass
		expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass)
	})
	t.Run("low but nonzero", func(t *testing.T) {
		// 3-node cluster: expected peers = 2 per node
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1} // has 1, expects 2
		nd2 := makeNodeData("2.2.2.2", "node")
		nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		nd3 := makeNodeData("3.3.3.3", "node")
		nd3.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2}
		data := makeCluster(map[string]*inspector.NodeData{
			"1.1.1.1": nd, "2.2.2.2": nd2, "3.3.3.3": nd3,
		})
		results := CheckIPFS(data)
		// Node 1.1.1.1 should warn (1 < 2)
		found := false
		for _, r := range results {
			if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusWarn {
				found = true
			}
		}
		if !found {
			t.Error("expected swarm_peers warn for node 1.1.1.1")
		}
	})
	t.Run("zero isolated", func(t *testing.T) {
		// 2-node cluster: a node with zero peers is isolated → fail.
		nd := makeNodeData("1.1.1.1", "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 0}
		nd2 := makeNodeData("2.2.2.2", "node")
		nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1}
		data := makeCluster(map[string]*inspector.NodeData{
			"1.1.1.1": nd, "2.2.2.2": nd2,
		})
		results := CheckIPFS(data)
		found := false
		for _, r := range results {
			if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusFail {
				found = true
			}
		}
		if !found {
			t.Error("expected swarm_peers fail for isolated node 1.1.1.1")
		}
	})
}
// TestCheckIPFS_RepoSize verifies the repo-usage thresholds: below 80% passes,
// 80–95% warns, and 95% or above fails.
func TestCheckIPFS_RepoSize(t *testing.T) {
	const mb = 1024 * 1024
	cases := []struct {
		name   string
		size   int64
		max    int64
		status inspector.Status
	}{
		{"healthy", 500 * mb, 1024 * mb, inspector.StatusPass},     // ~49%
		{"elevated", 870 * mb, 1024 * mb, inspector.StatusWarn},    // ~85%
		{"nearly full", 980 * mb, 1024 * mb, inspector.StatusFail}, // ~96%
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.IPFS = &inspector.IPFSData{
				DaemonActive:  true,
				RepoSizeBytes: tc.size,
				RepoMaxBytes:  tc.max,
			}
			results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, results, "ipfs.repo_size", tc.status)
		})
	}
}
// TestCheckIPFS_SwarmKeyMissing verifies a missing swarm.key fails the check.
func TestCheckIPFS_SwarmKeyMissing(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.IPFS = &inspector.IPFSData{DaemonActive: true}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ipfs.swarm_key", inspector.StatusFail)
}
// TestCheckIPFS_BootstrapNotEmpty verifies a non-empty bootstrap list warns,
// since a private swarm should not reference public bootstrap peers.
func TestCheckIPFS_BootstrapNotEmpty(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.IPFS = &inspector.IPFSData{DaemonActive: true}
	results := CheckIPFS(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusWarn)
}
// TestCheckIPFS_CrossNode_VersionConsistency verifies that identical Kubo and
// Cluster versions across all nodes pass the cross-node consistency checks.
func TestCheckIPFS_CrossNode_VersionConsistency(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for _, host := range hosts {
		nd := makeNodeData(host, "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: "0.22.0", ClusterVersion: "1.0.8"}
		nodes[host] = nd
	}
	results := CheckIPFS(makeCluster(nodes))
	expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusPass)
	expectStatus(t, results, "ipfs.cluster_version_consistent", inspector.StatusPass)
}
// TestCheckIPFS_CrossNode_VersionMismatch verifies that diverging Kubo
// versions across nodes produce a consistency warning.
func TestCheckIPFS_CrossNode_VersionMismatch(t *testing.T) {
	hostVersions := map[string]string{
		"1.1.1.1": "0.22.0",
		"2.2.2.2": "0.22.0",
		"3.3.3.3": "0.21.0",
	}
	nodes := make(map[string]*inspector.NodeData, len(hostVersions))
	for host, version := range hostVersions {
		nd := makeNodeData(host, "node")
		nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: version}
		nodes[host] = nd
	}
	results := CheckIPFS(makeCluster(nodes))
	expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusWarn)
}
func TestCheckIPFS_NilData(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckIPFS(data)
if len(results) != 0 {
t.Errorf("expected 0 results for nil IPFS data, got %d", len(results))
}
}

View File

@ -0,0 +1,155 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers CheckNamespace under the "namespace" subsystem key so the
// inspector runs it as part of a full cluster inspection.
func init() {
	inspector.RegisterChecker("namespace", CheckNamespace)
}
const nsSub = "namespace"
// CheckNamespace runs all namespace-level health checks: per-node service
// checks for every namespace each node hosts, followed by cluster-wide
// aggregates. Nodes that host no namespaces contribute no per-node results.
func CheckNamespace(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if len(node.Namespaces) > 0 {
			out = append(out, checkNamespacesPerNode(node)...)
		}
	}
	return append(out, checkNamespacesCrossNode(data)...)
}
// checkNamespacesPerNode emits per-node health checks for every namespace the
// node hosts: RQLite liveness, raft state, readiness, Olric port binding, and
// Gateway HTTP responsiveness. Check IDs are namespaced as "ns.<name>.<check>".
func checkNamespacesPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	node := nd.Node.Name()
	for _, ns := range nd.Namespaces {
		prefix := fmt.Sprintf("ns.%s", ns.Name)
		// RQLite health
		if ns.RQLiteUp {
			r = append(r, inspector.Pass(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d state=%s", ns.PortBase, ns.RQLiteState), inspector.Critical))
		} else {
			r = append(r, inspector.Fail(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
				fmt.Sprintf("port_base=%d not responding", ns.PortBase), inspector.Critical))
		}
		// RQLite Raft state — only meaningful when RQLite responded.
		// Leader/Follower are healthy; Candidate means an election is in
		// progress (transient); anything else is unexpected.
		if ns.RQLiteUp {
			switch ns.RQLiteState {
			case "Leader", "Follower":
				r = append(r, inspector.Pass(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			case "Candidate":
				r = append(r, inspector.Warn(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					"state=Candidate (election in progress)", inspector.Critical))
			default:
				r = append(r, inspector.Fail(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
					fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
			}
		}
		// RQLite readiness — fail only when the process is up but /readyz
		// fails; a down RQLite is already reported by rqlite_up.
		if ns.RQLiteReady {
			r = append(r, inspector.Pass(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz OK", inspector.Critical))
		} else if ns.RQLiteUp {
			r = append(r, inspector.Fail(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
				"/readyz failed", inspector.Critical))
		}
		// Olric health
		if ns.OlricUp {
			r = append(r, inspector.Pass(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port bound", inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
				"memberlist port not bound", inspector.High))
		}
		// Gateway health
		if ns.GatewayUp {
			r = append(r, inspector.Pass(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		} else {
			r = append(r, inspector.Fail(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
				fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
		}
	}
	return r
}
// checkNamespacesCrossNode aggregates namespace health across the cluster.
// For every namespace seen on any node it emits:
//   - ns.<name>.all_healthy: pass only when RQLite, Olric, and the Gateway
//     are all up on every node hosting the namespace;
//   - ns.<name>.quorum: pass when a majority (N/2+1) of the namespace's
//     RQLite instances are responding.
func checkNamespacesCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	// Tally everything in a single pass over the cluster instead of
	// re-scanning every node per namespace when computing quorum.
	nsNodes := map[string]int{}    // namespace name → count of nodes running it
	nsHealthy := map[string]int{}  // namespace name → nodes where all services are up
	nsRQLiteUp := map[string]int{} // namespace name → nodes with RQLite responding
	for _, nd := range data.Nodes {
		for _, ns := range nd.Namespaces {
			nsNodes[ns.Name]++
			if ns.RQLiteUp {
				nsRQLiteUp[ns.Name]++
			}
			if ns.RQLiteUp && ns.OlricUp && ns.GatewayUp {
				nsHealthy[ns.Name]++
			}
		}
	}
	for name, total := range nsNodes {
		healthy := nsHealthy[name]
		if healthy == total {
			r = append(r, inspector.Pass(
				fmt.Sprintf("ns.%s.all_healthy", name),
				fmt.Sprintf("Namespace %s healthy on all nodes", name),
				nsSub, "",
				fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
				inspector.Critical))
		} else {
			r = append(r, inspector.Fail(
				fmt.Sprintf("ns.%s.all_healthy", name),
				fmt.Sprintf("Namespace %s healthy on all nodes", name),
				nsSub, "",
				fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
				inspector.Critical))
		}
		// Check namespace has quorum (>= N/2+1 RQLite instances)
		rqliteUp := nsRQLiteUp[name]
		quorumNeeded := total/2 + 1
		if rqliteUp >= quorumNeeded {
			r = append(r, inspector.Pass(
				fmt.Sprintf("ns.%s.quorum", name),
				fmt.Sprintf("Namespace %s RQLite quorum", name),
				nsSub, "",
				fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d", rqliteUp, total, quorumNeeded),
				inspector.Critical))
		} else {
			r = append(r, inspector.Fail(
				fmt.Sprintf("ns.%s.quorum", name),
				fmt.Sprintf("Namespace %s RQLite quorum", name),
				nsSub, "",
				fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d (QUORUM LOST)", rqliteUp, total, quorumNeeded),
				inspector.Critical))
		}
	}
	return r
}

View File

@ -0,0 +1,165 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckNamespace_PerNodeHealthy verifies a namespace with every service
// healthy passes all per-node checks.
func TestCheckNamespace_PerNodeHealthy(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{{
		Name:          "myapp",
		PortBase:      10000,
		RQLiteUp:      true,
		RQLiteState:   "Leader",
		RQLiteReady:   true,
		OlricUp:       true,
		GatewayUp:     true,
		GatewayStatus: 200,
	}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	for _, id := range []string{
		"ns.myapp.rqlite_up", "ns.myapp.rqlite_state", "ns.myapp.rqlite_ready",
		"ns.myapp.olric_up", "ns.myapp.gateway_up",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckNamespace_RQLiteDown verifies an unresponsive RQLite fails rqlite_up.
func TestCheckNamespace_RQLiteDown(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{{Name: "myapp", PortBase: 10000}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ns.myapp.rqlite_up", inspector.StatusFail)
}
// TestCheckNamespace_RQLiteStates verifies the raft-state mapping:
// Leader/Follower pass, Candidate warns, anything else fails.
func TestCheckNamespace_RQLiteStates(t *testing.T) {
	cases := []struct {
		state  string
		status inspector.Status
	}{
		{"Leader", inspector.StatusPass},
		{"Follower", inspector.StatusPass},
		{"Candidate", inspector.StatusWarn},
		{"Unknown", inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.state, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Namespaces = []inspector.NamespaceData{{
				Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: tc.state,
			}}
			results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, results, "ns.myapp.rqlite_state", tc.status)
		})
	}
}
// TestCheckNamespace_RQLiteNotReady verifies that an RQLite which is up but
// failing /readyz fails the readiness check.
func TestCheckNamespace_RQLiteNotReady(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{{
		Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: "Follower",
	}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusFail)
}
// TestCheckNamespace_OlricDown verifies an unbound Olric port fails olric_up.
func TestCheckNamespace_OlricDown(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{{Name: "myapp"}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusFail)
}
// TestCheckNamespace_GatewayDown verifies an unresponsive gateway fails gateway_up.
func TestCheckNamespace_GatewayDown(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{{Name: "myapp", GatewayStatus: 0}}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusFail)
}
// TestCheckNamespace_CrossNode_AllHealthy verifies that a namespace healthy on
// every node passes both the all_healthy and quorum aggregates.
func TestCheckNamespace_CrossNode_AllHealthy(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for _, host := range hosts {
		nd := makeNodeData(host, "node")
		nd.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: true, OlricUp: true, GatewayUp: true}}
		nodes[host] = nd
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusPass)
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
}
// TestCheckNamespace_CrossNode_PartialHealthy verifies that one node with a
// degraded service fails all_healthy while RQLite quorum still holds.
func TestCheckNamespace_CrossNode_PartialHealthy(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		nd := makeNodeData(host, "node")
		nd.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: true, OlricUp: i < 2, GatewayUp: true}}
		nodes[host] = nd
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusFail)
	// Quorum should still pass (3/3 RQLite up, need 2)
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass)
}
// TestCheckNamespace_CrossNode_QuorumLost verifies that losing a majority of
// RQLite instances (1 of 3 up, quorum needs 2) fails the quorum check.
func TestCheckNamespace_CrossNode_QuorumLost(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	rqliteUp := []bool{true, false, false}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		nd := makeNodeData(host, "node")
		nd.Namespaces = []inspector.NamespaceData{{Name: "myapp", RQLiteUp: rqliteUp[i], OlricUp: true, GatewayUp: true}}
		nodes[host] = nd
	}
	results := CheckNamespace(makeCluster(nodes))
	expectStatus(t, results, "ns.myapp.quorum", inspector.StatusFail)
}
// TestCheckNamespace_MultipleNamespaces verifies that checks for namespaces on
// the same node are reported independently of each other.
func TestCheckNamespace_MultipleNamespaces(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Namespaces = []inspector.NamespaceData{
		{Name: "app1", RQLiteUp: true, RQLiteState: "Leader", OlricUp: true, GatewayUp: true},
		{Name: "app2", OlricUp: true, GatewayUp: true},
	}
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "ns.app1.rqlite_up", inspector.StatusPass)
	expectStatus(t, results, "ns.app2.rqlite_up", inspector.StatusFail)
}
// TestCheckNamespace_NoNamespaces verifies a node without namespaces yields no
// results — neither per-node nor cross-node aggregates.
func TestCheckNamespace_NoNamespaces(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	results := CheckNamespace(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	for _, got := range results {
		t.Errorf("unexpected check: %s", got.ID)
	}
}

View File

@ -0,0 +1,113 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers CheckNetwork under the "network" subsystem key so the
// inspector runs it as part of a full cluster inspection.
func init() {
	inspector.RegisterChecker("network", CheckNetwork)
}
const networkSub = "network"
// CheckNetwork runs all network-level health checks. Nodes without collected
// network data are skipped; all network checks are per-node.
func CheckNetwork(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.Network == nil {
			continue
		}
		out = append(out, checkNetworkPerNode(node)...)
	}
	return out
}
// checkNetworkPerNode emits the network checks for a single node. The
// numbered comments reference the inspection checklist items they implement.
func checkNetworkPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var r []inspector.CheckResult
	// Renamed from "net" to avoid shadowing the standard library package name.
	nw := nd.Network
	node := nd.Node.Name()
	// 7.2 Internet connectivity
	if nw.InternetReachable {
		r = append(r, inspector.Pass("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 succeeded", inspector.High))
	} else {
		r = append(r, inspector.Fail("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node,
			"ping 8.8.8.8 failed", inspector.High))
	}
	// 7.14 Default route
	if nw.DefaultRoute {
		r = append(r, inspector.Pass("network.default_route", "Default route exists", networkSub, node,
			"default route present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.default_route", "Default route exists", networkSub, node,
			"no default route", inspector.Critical))
	}
	// 7.15 WG subnet route (the WireGuard mesh subnet must route via wg0)
	if nw.WGRouteExists {
		r = append(r, inspector.Pass("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 via wg0 present", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("network.wg_route", "WG subnet route exists", networkSub, node,
			"10.0.0.0/24 route via wg0 NOT found", inspector.Critical))
	}
	// 7.4 TCP connections: warn above 5000 established; zero/unset is skipped.
	if nw.TCPEstablished > 0 {
		if nw.TCPEstablished < 5000 {
			r = append(r, inspector.Pass("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d", nw.TCPEstablished), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("network.tcp_established", "TCP connections reasonable", networkSub, node,
				fmt.Sprintf("established=%d (high)", nw.TCPEstablished), inspector.Medium))
		}
	}
	// 7.6 TIME_WAIT: warn above 10000 sockets stuck in TIME_WAIT.
	if nw.TCPTimeWait < 10000 {
		r = append(r, inspector.Pass("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d", nw.TCPTimeWait), inspector.Medium))
	} else {
		r = append(r, inspector.Warn("network.tcp_timewait", "TIME_WAIT count low", networkSub, node,
			fmt.Sprintf("timewait=%d (accumulating)", nw.TCPTimeWait), inspector.Medium))
	}
	// 7.8 TCP retransmission rate: <1% pass, <5% warn, otherwise fail.
	// NOTE(review): a negative rate appears to mean "not collected" and skips
	// the check — confirm against the collector.
	if nw.TCPRetransRate >= 0 {
		if nw.TCPRetransRate < 1 {
			r = append(r, inspector.Pass("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%%", nw.TCPRetransRate), inspector.Medium))
		} else if nw.TCPRetransRate < 5 {
			r = append(r, inspector.Warn("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (elevated)", nw.TCPRetransRate), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("network.tcp_retrans", "TCP retransmission rate low", networkSub, node,
				fmt.Sprintf("retrans=%.2f%% (high packet loss)", nw.TCPRetransRate), inspector.High))
		}
	}
	// 7.10 WG mesh peer pings (NxN connectivity): every recorded peer ping
	// must have succeeded; skipped entirely when no pings were recorded.
	if len(nw.PingResults) > 0 {
		failCount := 0
		for _, ok := range nw.PingResults {
			if !ok {
				failCount++
			}
		}
		if failCount == 0 {
			r = append(r, inspector.Pass("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers pingable", len(nw.PingResults), len(nw.PingResults)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node,
				fmt.Sprintf("%d/%d peers unreachable", failCount, len(nw.PingResults)), inspector.Critical))
		}
	}
	return r
}

View File

@ -0,0 +1,151 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckNetwork_HealthyNode verifies that a node with healthy network data
// passes every network check.
func TestCheckNetwork_HealthyNode(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Network = &inspector.NetworkData{
		InternetReachable: true,
		DefaultRoute:      true,
		WGRouteExists:     true,
		TCPEstablished:    100,
		TCPTimeWait:       50,
		TCPRetransRate:    0.1,
		PingResults:       map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
	}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	for _, id := range []string{
		"network.internet", "network.default_route", "network.wg_route",
		"network.tcp_established", "network.tcp_timewait", "network.tcp_retrans",
		"network.wg_mesh_ping",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckNetwork_InternetUnreachable verifies a failed ping fails the check.
func TestCheckNetwork_InternetUnreachable(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Network = &inspector.NetworkData{}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "network.internet", inspector.StatusFail)
}
// TestCheckNetwork_MissingRoutes verifies missing default and WG routes fail.
func TestCheckNetwork_MissingRoutes(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.Network = &inspector.NetworkData{}
	results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "network.default_route", inspector.StatusFail)
	expectStatus(t, results, "network.wg_route", inspector.StatusFail)
}
// TestCheckNetwork_TCPConnections verifies the established-connection
// threshold: under 5000 passes, at or above warns.
func TestCheckNetwork_TCPConnections(t *testing.T) {
	cases := []struct {
		name   string
		estab  int
		status inspector.Status
	}{
		{"normal", 100, inspector.StatusPass},
		{"high", 6000, inspector.StatusWarn},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Network = &inspector.NetworkData{TCPEstablished: tc.estab}
			results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, results, "network.tcp_established", tc.status)
		})
	}
}
// TestCheckNetwork_TCPTimeWait verifies the TIME_WAIT threshold: under 10000
// passes, at or above warns.
func TestCheckNetwork_TCPTimeWait(t *testing.T) {
	cases := []struct {
		name   string
		tw     int
		status inspector.Status
	}{
		{"normal", 50, inspector.StatusPass},
		{"high", 15000, inspector.StatusWarn},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.Network = &inspector.NetworkData{TCPTimeWait: tc.tw}
			results := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
			expectStatus(t, results, "network.tcp_timewait", tc.status)
		})
	}
}
// TestCheckNetwork_TCPRetransmission covers the retransmission-rate
// thresholds: low passes, elevated warns, high fails.
func TestCheckNetwork_TCPRetransmission(t *testing.T) {
	cases := []struct {
		label string
		rate  float64
		want  inspector.Status
	}{
		{"low", 0.1, inspector.StatusPass},
		{"elevated", 3.0, inspector.StatusWarn},
		{"high", 8.0, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Network = &inspector.NetworkData{TCPRetransRate: tc.rate}
			got := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "network.tcp_retrans", tc.want)
		})
	}
}
func TestCheckNetwork_WGMeshPing(t *testing.T) {
t.Run("all ok", func(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.Network = &inspector.NetworkData{
PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true},
}
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckNetwork(data)
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass)
})
t.Run("some fail", func(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.Network = &inspector.NetworkData{
PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": false},
}
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckNetwork(data)
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusFail)
})
t.Run("no pings", func(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.Network = &inspector.NetworkData{PingResults: map[string]bool{}}
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckNetwork(data)
// No ping results → no wg_mesh_ping check
if findCheck(results, "network.wg_mesh_ping") != nil {
t.Error("should not emit wg_mesh_ping when no ping results")
}
})
}
// TestCheckNetwork_NilData verifies that a node without collected network
// data produces no network checks at all.
func TestCheckNetwork_NilData(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	got := CheckNetwork(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	if len(got) != 0 {
		t.Errorf("expected 0 results for nil Network data, got %d", len(got))
	}
}

View File

@ -0,0 +1,157 @@
package checks
import (
"fmt"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the Olric checker with the inspector's checker registry so
// it is picked up when the inspector runs all subsystem checks.
func init() {
	inspector.RegisterChecker("olric", CheckOlric)
}

// olricSub is the subsystem label attached to every Olric check result.
const olricSub = "olric"
// CheckOlric runs all Olric health checks against cluster data: per-node
// probes for every node that has Olric data, followed by the cluster-wide
// cross-node checks. Nodes without Olric data are skipped entirely.
func CheckOlric(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.Olric != nil {
			out = append(out, checkOlricPerNode(node)...)
		}
	}
	return append(out, checkOlricCrossNode(data)...)
}
// checkOlricPerNode evaluates one node's Olric health: service state,
// memberlist port, restart count, process memory, and journal-derived
// counters (suspect members, join/leave flapping, error rate). When the
// service itself is down, the remaining probes are meaningless and skipped.
func checkOlricPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	var res []inspector.CheckResult
	o := nd.Olric
	node := nd.Node.Name()

	// 2.1 Service active — gate for every other per-node check.
	if !o.ServiceActive {
		res = append(res, inspector.Fail("olric.service_active", "Olric service active", olricSub, node,
			"debros-olric is not active", inspector.Critical))
		return res
	}
	res = append(res, inspector.Pass("olric.service_active", "Olric service active", olricSub, node,
		"debros-olric is active", inspector.Critical))

	// 2.7 Memberlist port accepting connections.
	if o.MemberlistUp {
		res = append(res, inspector.Pass("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
			"TCP 3322 is bound", inspector.Critical))
	} else {
		res = append(res, inspector.Fail("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
			"TCP 3322 is not listening", inspector.Critical))
	}

	// 2.3 Restart count: zero is healthy, a few is suspicious, more suggests
	// a crash loop.
	switch {
	case o.RestartCount == 0:
		res = append(res, inspector.Pass("olric.restarts", "Low restart count", olricSub, node,
			"NRestarts=0", inspector.High))
	case o.RestartCount <= 3:
		res = append(res, inspector.Warn("olric.restarts", "Low restart count", olricSub, node,
			fmt.Sprintf("NRestarts=%d", o.RestartCount), inspector.High))
	default:
		res = append(res, inspector.Fail("olric.restarts", "Low restart count", olricSub, node,
			fmt.Sprintf("NRestarts=%d (crash-looping?)", o.RestartCount), inspector.High))
	}

	// 2.4 Process memory (RSS) — only judged when a measurement is present.
	if o.ProcessMemMB > 0 {
		switch {
		case o.ProcessMemMB < 200:
			res = append(res, inspector.Pass("olric.memory", "Process memory healthy", olricSub, node,
				fmt.Sprintf("RSS=%dMB", o.ProcessMemMB), inspector.Medium))
		case o.ProcessMemMB < 500:
			res = append(res, inspector.Warn("olric.memory", "Process memory healthy", olricSub, node,
				fmt.Sprintf("RSS=%dMB (elevated)", o.ProcessMemMB), inspector.Medium))
		default:
			res = append(res, inspector.Fail("olric.memory", "Process memory healthy", olricSub, node,
				fmt.Sprintf("RSS=%dMB (high)", o.ProcessMemMB), inspector.High))
		}
	}

	// 2.9-2.11 Log analysis: suspect/failed member messages in the last hour.
	if o.LogSuspects == 0 {
		res = append(res, inspector.Pass("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
			"no suspect messages in last hour", inspector.Critical))
	} else {
		res = append(res, inspector.Fail("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
			fmt.Sprintf("%d suspect/failed messages in last hour", o.LogSuspects), inspector.Critical))
	}

	// 2.13 Flapping detection: rapid join/leave cycles in the last hour.
	if o.LogFlapping < 5 {
		res = append(res, inspector.Pass("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
			fmt.Sprintf("join/leave events=%d in last hour", o.LogFlapping), inspector.High))
	} else {
		res = append(res, inspector.Warn("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
			fmt.Sprintf("join/leave events=%d in last hour (flapping?)", o.LogFlapping), inspector.High))
	}

	// 2.39 Log error rate over the last hour.
	switch {
	case o.LogErrors < 5:
		res = append(res, inspector.Pass("olric.log_errors", "Log error rate low", olricSub, node,
			fmt.Sprintf("errors=%d in last hour", o.LogErrors), inspector.High))
	case o.LogErrors < 20:
		res = append(res, inspector.Warn("olric.log_errors", "Log error rate low", olricSub, node,
			fmt.Sprintf("errors=%d in last hour", o.LogErrors), inspector.High))
	default:
		res = append(res, inspector.Fail("olric.log_errors", "Log error rate low", olricSub, node,
			fmt.Sprintf("errors=%d in last hour (high)", o.LogErrors), inspector.High))
	}
	return res
}
// checkOlricCrossNode evaluates cluster-wide Olric invariants: every node
// with Olric data should be running the service and have its memberlist port
// listening. With fewer than two such nodes there is nothing to compare, so
// no checks are emitted.
func checkOlricCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	var total, active, memberlist int
	for _, nd := range data.Nodes {
		if nd.Olric == nil {
			continue
		}
		total++
		if nd.Olric.ServiceActive {
			active++
		}
		if nd.Olric.MemberlistUp {
			memberlist++
		}
	}
	if total < 2 {
		return out
	}
	// All nodes have Olric running.
	if active == total {
		out = append(out, inspector.Pass("olric.all_active", "All nodes running Olric", olricSub, "",
			fmt.Sprintf("%d/%d nodes active", active, total), inspector.Critical))
	} else {
		out = append(out, inspector.Fail("olric.all_active", "All nodes running Olric", olricSub, "",
			fmt.Sprintf("%d/%d nodes active", active, total), inspector.Critical))
	}
	// All memberlist ports up.
	if memberlist == total {
		out = append(out, inspector.Pass("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
			fmt.Sprintf("%d/%d nodes with memberlist", memberlist, total), inspector.High))
	} else {
		out = append(out, inspector.Warn("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
			fmt.Sprintf("%d/%d nodes with memberlist", memberlist, total), inspector.High))
	}
	return out
}

View File

@ -0,0 +1,149 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckOlric_ServiceInactive verifies that an inactive service fails the
// check and short-circuits the remaining per-node probes.
func TestCheckOlric_ServiceInactive(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{ServiceActive: false}
	got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "olric.service_active", inspector.StatusFail)
	// Should return early — no further per-node checks
	if findCheck(got, "olric.memberlist_port") != nil {
		t.Error("should not check memberlist when service inactive")
	}
}
// TestCheckOlric_HealthyNode verifies that a fully healthy node passes every
// per-node Olric check.
func TestCheckOlric_HealthyNode(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{
		ServiceActive: true,
		MemberlistUp:  true,
		RestartCount:  0,
		ProcessMemMB:  100,
		LogSuspects:   0,
		LogFlapping:   0,
		LogErrors:     0,
	}
	got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	passing := []string{
		"olric.service_active",
		"olric.memberlist_port",
		"olric.restarts",
		"olric.log_suspects",
		"olric.log_flapping",
		"olric.log_errors",
	}
	for _, id := range passing {
		expectStatus(t, got, id, inspector.StatusPass)
	}
}
// TestCheckOlric_RestartCounts covers the restart-count thresholds: zero
// passes, a few warns, many fails.
func TestCheckOlric_RestartCounts(t *testing.T) {
	cases := []struct {
		label    string
		restarts int
		want     inspector.Status
	}{
		{"zero", 0, inspector.StatusPass},
		{"few", 2, inspector.StatusWarn},
		{"many", 5, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, RestartCount: tc.restarts}
			got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "olric.restarts", tc.want)
		})
	}
}
// TestCheckOlric_Memory covers the process RSS thresholds: healthy passes,
// elevated warns, high fails.
func TestCheckOlric_Memory(t *testing.T) {
	cases := []struct {
		label string
		memMB int
		want  inspector.Status
	}{
		{"healthy", 100, inspector.StatusPass},
		{"elevated", 300, inspector.StatusWarn},
		{"high", 600, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, ProcessMemMB: tc.memMB}
			got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "olric.memory", tc.want)
		})
	}
}
// TestCheckOlric_LogSuspects verifies that any suspect/failed-member log
// messages fail the suspects check.
func TestCheckOlric_LogSuspects(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.Olric = &inspector.OlricData{ServiceActive: true, LogSuspects: 5}
	got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "olric.log_suspects", inspector.StatusFail)
}
// TestCheckOlric_LogErrors covers the log error-rate thresholds: none passes,
// a few warns, many fails.
func TestCheckOlric_LogErrors(t *testing.T) {
	cases := []struct {
		label  string
		errors int
		want   inspector.Status
	}{
		{"none", 0, inspector.StatusPass},
		{"few", 10, inspector.StatusWarn},
		{"many", 30, inspector.StatusFail},
	}
	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			n := makeNodeData("1.1.1.1", "node")
			n.Olric = &inspector.OlricData{ServiceActive: true, LogErrors: tc.errors}
			got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
			expectStatus(t, got, "olric.log_errors", tc.want)
		})
	}
}
// TestCheckOlric_CrossNode_AllActive verifies that when every node runs Olric
// with its memberlist port up, both cross-node checks pass.
func TestCheckOlric_CrossNode_AllActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		n := makeNodeData(host, "node")
		n.Olric = &inspector.OlricData{ServiceActive: true, MemberlistUp: true}
		nodes[host] = n
	}
	got := CheckOlric(makeCluster(nodes))
	expectStatus(t, got, "olric.all_active", inspector.StatusPass)
	expectStatus(t, got, "olric.all_memberlist", inspector.StatusPass)
}
// TestCheckOlric_CrossNode_PartialActive verifies that the all-active check
// fails when only a subset of nodes is running Olric.
func TestCheckOlric_CrossNode_PartialActive(t *testing.T) {
	nodes := map[string]*inspector.NodeData{}
	for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		n := makeNodeData(host, "node")
		up := i < 2 // third node is down
		n.Olric = &inspector.OlricData{ServiceActive: up, MemberlistUp: up}
		nodes[host] = n
	}
	got := CheckOlric(makeCluster(nodes))
	expectStatus(t, got, "olric.all_active", inspector.StatusFail)
}
// TestCheckOlric_NilData verifies that a node without collected Olric data
// produces no Olric checks at all.
func TestCheckOlric_NilData(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	got := CheckOlric(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	if len(got) != 0 {
		t.Errorf("expected 0 results for nil Olric data, got %d", len(got))
	}
}

View File

@ -0,0 +1,533 @@
package checks
import (
"fmt"
"math"
"strings"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the RQLite checker with the inspector's checker registry so
// it is picked up when the inspector runs all subsystem checks.
func init() {
	inspector.RegisterChecker("rqlite", CheckRQLite)
}

// rqliteSub is the subsystem label attached to every RQLite check result.
const rqliteSub = "rqlite"
// CheckRQLite runs all RQLite health checks against cluster data: first the
// per-node probes for every node that has RQLite data, then the cluster-wide
// cross-node consistency checks.
func CheckRQLite(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, node := range data.Nodes {
		if node.RQLite != nil {
			out = append(out, checkRQLitePerNode(node, data)...)
		}
	}
	return append(out, checkRQLiteCrossNode(data)...)
}
// checkRQLitePerNode evaluates a single node's RQLite health: HTTP
// responsiveness, /readyz readiness, Raft state, replication indices,
// snapshots, resource usage, node reachability as seen from this node,
// a strong-consistency read probe, and expvar-derived error counters.
//
// It returns early when the HTTP endpoint is down (nothing else can be
// probed) or when /status could not be parsed (the remaining checks all
// read parsed status fields).
func checkRQLitePerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	rq := nd.RQLite
	node := nd.Node.Name()
	// 1.2 HTTP endpoint responsive — gate for every other per-node check.
	if !rq.Responsive {
		r = append(r, inspector.Fail("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
			"curl localhost:5001/status failed or returned error", inspector.Critical))
		return r
	}
	r = append(r, inspector.Pass("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node,
		"responding on port 5001", inspector.Critical))
	// 1.3 Full readiness (/readyz): node, leader, and store must all be ready.
	if rq.Readyz != nil {
		if rq.Readyz.Ready {
			r = append(r, inspector.Pass("rqlite.readyz", "Full readiness check", rqliteSub, node,
				"node, leader, store all ready", inspector.Critical))
		} else {
			// List only the components that are not ready.
			var parts []string
			if rq.Readyz.Node != "ready" {
				parts = append(parts, "node: "+rq.Readyz.Node)
			}
			if rq.Readyz.Leader != "ready" {
				parts = append(parts, "leader: "+rq.Readyz.Leader)
			}
			if rq.Readyz.Store != "ready" {
				parts = append(parts, "store: "+rq.Readyz.Store)
			}
			r = append(r, inspector.Fail("rqlite.readyz", "Full readiness check", rqliteSub, node,
				"not ready: "+strings.Join(parts, ", "), inspector.Critical))
		}
	}
	s := rq.Status
	if s == nil {
		r = append(r, inspector.Skip("rqlite.status_parsed", "Status JSON parseable", rqliteSub, node,
			"could not parse /status response", inspector.Critical))
		return r
	}
	// 1.5 Raft state valid: Leader/Follower are healthy, Candidate means an
	// election is in progress, anything else is a failure.
	switch s.RaftState {
	case "Leader", "Follower":
		r = append(r, inspector.Pass("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			fmt.Sprintf("state=%s", s.RaftState), inspector.Critical))
	case "Candidate":
		r = append(r, inspector.Warn("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			"state=Candidate (election in progress)", inspector.Critical))
	case "Shutdown":
		r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			"state=Shutdown", inspector.Critical))
	default:
		r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node,
			fmt.Sprintf("unexpected state=%q", s.RaftState), inspector.Critical))
	}
	// 1.7 Leader identity known
	if s.LeaderNodeID == "" {
		r = append(r, inspector.Fail("rqlite.leader_known", "Leader identity known", rqliteSub, node,
			"leader node_id is empty", inspector.Critical))
	} else {
		r = append(r, inspector.Pass("rqlite.leader_known", "Leader identity known", rqliteSub, node,
			fmt.Sprintf("leader=%s", s.LeaderNodeID), inspector.Critical))
	}
	// 1.8 Voter status
	if s.Voter {
		r = append(r, inspector.Pass("rqlite.voter", "Node is voter", rqliteSub, node,
			"voter=true", inspector.Low))
	} else {
		r = append(r, inspector.Warn("rqlite.voter", "Node is voter", rqliteSub, node,
			"voter=false (non-voter)", inspector.Low))
	}
	// 1.9 Num peers — use the node's own /nodes endpoint to determine cluster size
	// (not config file, since not all config nodes are necessarily in the Raft cluster).
	// Note: len() of a nil map is 0, so no separate nil check is needed.
	if len(rq.Nodes) > 0 {
		expectedPeers := len(rq.Nodes) - 1 // cluster members minus self
		if expectedPeers < 0 {
			expectedPeers = 0
		}
		if s.NumPeers == expectedPeers {
			r = append(r, inspector.Pass("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
				fmt.Sprintf("peers=%d (cluster has %d nodes)", s.NumPeers, len(rq.Nodes)), inspector.Critical))
		} else {
			r = append(r, inspector.Warn("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node,
				fmt.Sprintf("peers=%d but /nodes reports %d members", s.NumPeers, len(rq.Nodes)), inspector.High))
		}
	} else {
		r = append(r, inspector.Pass("rqlite.num_peers", "Peer count reported", rqliteSub, node,
			fmt.Sprintf("peers=%d", s.NumPeers), inspector.Medium))
	}
	// 1.11 Commit index vs applied index: a growing gap means the FSM is
	// falling behind the Raft log. Clamp to 0 if applied leads commit
	// (possible between samples) to avoid unsigned underflow.
	if s.CommitIndex > 0 && s.AppliedIndex > 0 {
		gap := s.CommitIndex - s.AppliedIndex
		if s.AppliedIndex > s.CommitIndex {
			gap = 0
		}
		if gap <= 2 {
			r = append(r, inspector.Pass("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		} else if gap <= 100 {
			r = append(r, inspector.Warn("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d (lagging)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node,
				fmt.Sprintf("commit=%d applied=%d gap=%d (severely behind)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical))
		}
	}
	// 1.12 FSM pending queue
	if s.FsmPending == 0 {
		r = append(r, inspector.Pass("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			"fsm_pending=0", inspector.High))
	} else if s.FsmPending <= 10 {
		r = append(r, inspector.Warn("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			fmt.Sprintf("fsm_pending=%d", s.FsmPending), inspector.High))
	} else {
		r = append(r, inspector.Fail("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node,
			fmt.Sprintf("fsm_pending=%d (backlog)", s.FsmPending), inspector.High))
	}
	// 1.13 Last contact (followers only) — recorded informationally when present.
	if s.RaftState == "Follower" && s.LastContact != "" {
		r = append(r, inspector.Pass("rqlite.last_contact", "Follower last contact recent", rqliteSub, node,
			fmt.Sprintf("last_contact=%s", s.LastContact), inspector.Critical))
	}
	// 1.14 Last log term matches current term
	if s.LastLogTerm > 0 && s.Term > 0 {
		if s.LastLogTerm == s.Term {
			r = append(r, inspector.Pass("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
				fmt.Sprintf("term=%d last_log_term=%d", s.Term, s.LastLogTerm), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.log_term_match", "Last log term matches current", rqliteSub, node,
				fmt.Sprintf("term=%d last_log_term=%d (mismatch)", s.Term, s.LastLogTerm), inspector.Medium))
		}
	}
	// 1.15 db_applied_index == fsm_index: divergence means the SQLite file and
	// the FSM disagree.
	if s.DBAppliedIndex > 0 && s.FsmIndex > 0 {
		if s.DBAppliedIndex == s.FsmIndex {
			r = append(r, inspector.Pass("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
				fmt.Sprintf("db_applied=%d fsm=%d", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node,
				fmt.Sprintf("db_applied=%d fsm=%d (diverged)", s.DBAppliedIndex, s.FsmIndex), inspector.Critical))
		}
	}
	// 1.18 Last snapshot index close to applied (clamped to avoid underflow).
	if s.LastSnapshot > 0 && s.AppliedIndex > 0 {
		gap := s.AppliedIndex - s.LastSnapshot
		if s.LastSnapshot > s.AppliedIndex {
			gap = 0
		}
		if gap < 10000 {
			r = append(r, inspector.Pass("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
				fmt.Sprintf("snapshot_index=%d applied=%d gap=%d", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node,
				fmt.Sprintf("snapshot_index=%d applied=%d gap=%d (old snapshot)", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium))
		}
	}
	// 1.19 At least 1 snapshot exists
	if s.LastSnapshot > 0 {
		r = append(r, inspector.Pass("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
			fmt.Sprintf("last_snapshot_index=%d", s.LastSnapshot), inspector.Medium))
	} else {
		r = append(r, inspector.Warn("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node,
			"no snapshots found", inspector.Medium))
	}
	// 1.27 Database size (informational)
	if s.DBSizeFriendly != "" {
		r = append(r, inspector.Pass("rqlite.db_size", "Database size reported", rqliteSub, node,
			fmt.Sprintf("db_size=%s", s.DBSizeFriendly), inspector.Low))
	}
	// 1.31 Goroutine count
	if s.Goroutines > 0 {
		if s.Goroutines < 200 {
			r = append(r, inspector.Pass("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d", s.Goroutines), inspector.Medium))
		} else if s.Goroutines < 1000 {
			r = append(r, inspector.Warn("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d (elevated)", s.Goroutines), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node,
				fmt.Sprintf("goroutines=%d (high)", s.Goroutines), inspector.High))
		}
	}
	// 1.32 Memory (HeapAlloc, bytes → MB)
	if s.HeapAlloc > 0 {
		mb := s.HeapAlloc / (1024 * 1024)
		if mb < 500 {
			r = append(r, inspector.Pass("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB", mb), inspector.Medium))
		} else if mb < 1000 {
			r = append(r, inspector.Warn("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB (elevated)", mb), inspector.Medium))
		} else {
			r = append(r, inspector.Fail("rqlite.memory", "Memory usage healthy", rqliteSub, node,
				fmt.Sprintf("heap=%dMB (high)", mb), inspector.High))
		}
	}
	// 1.35 Version reported (informational)
	if s.Version != "" {
		r = append(r, inspector.Pass("rqlite.version", "Version reported", rqliteSub, node,
			fmt.Sprintf("version=%s", s.Version), inspector.Low))
	}
	// Node reachability from /nodes endpoint: one Fail per unreachable peer,
	// or a single Pass when everything answered. The nil check is kept so a
	// node with no /nodes data emits neither.
	if rq.Nodes != nil {
		unreachable := 0
		for addr, n := range rq.Nodes {
			if !n.Reachable {
				unreachable++
				r = append(r, inspector.Fail("rqlite.node_reachable", "Cluster node reachable", rqliteSub, node,
					fmt.Sprintf("%s is unreachable from this node", addr), inspector.Critical))
			}
		}
		if unreachable == 0 {
			r = append(r, inspector.Pass("rqlite.all_reachable", "All cluster nodes reachable", rqliteSub, node,
				fmt.Sprintf("all %d nodes reachable", len(rq.Nodes)), inspector.Critical))
		}
	}
	// 1.46 Strong read test (rq.Responsive is always true at this point, so
	// the else branch is effectively unconditional; kept for clarity).
	if rq.StrongRead {
		r = append(r, inspector.Pass("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
			"SELECT 1 at level=strong OK", inspector.Critical))
	} else if rq.Responsive {
		r = append(r, inspector.Fail("rqlite.strong_read", "Strong read succeeds", rqliteSub, node,
			"SELECT 1 at level=strong failed", inspector.Critical))
	}
	// Debug vars (expvar) checks
	if dv := rq.DebugVars; dv != nil {
		// 1.28 Query errors
		if dv.QueryErrors == 0 {
			r = append(r, inspector.Pass("rqlite.query_errors", "No query errors", rqliteSub, node,
				"query_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Warn("rqlite.query_errors", "No query errors", rqliteSub, node,
				fmt.Sprintf("query_errors=%d", dv.QueryErrors), inspector.High))
		}
		// 1.29 Execute errors
		if dv.ExecuteErrors == 0 {
			r = append(r, inspector.Pass("rqlite.execute_errors", "No execute errors", rqliteSub, node,
				"execute_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Warn("rqlite.execute_errors", "No execute errors", rqliteSub, node,
				fmt.Sprintf("execute_errors=%d", dv.ExecuteErrors), inspector.High))
		}
		// 1.30 Leader not found events
		if dv.LeaderNotFound == 0 {
			r = append(r, inspector.Pass("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
				"leader_not_found=0", inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node,
				fmt.Sprintf("leader_not_found=%d", dv.LeaderNotFound), inspector.Critical))
		}
		// Snapshot errors
		if dv.SnapshotErrors == 0 {
			r = append(r, inspector.Pass("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
				"snapshot_errors=0", inspector.High))
		} else {
			r = append(r, inspector.Fail("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node,
				fmt.Sprintf("snapshot_errors=%d", dv.SnapshotErrors), inspector.High))
		}
		// Client retries/timeouts
		if dv.ClientRetries == 0 && dv.ClientTimeouts == 0 {
			r = append(r, inspector.Pass("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
				"retries=0 timeouts=0", inspector.Medium))
		} else {
			r = append(r, inspector.Warn("rqlite.client_health", "No client retries or timeouts", rqliteSub, node,
				fmt.Sprintf("retries=%d timeouts=%d", dv.ClientRetries, dv.ClientTimeouts), inspector.Medium))
		}
	}
	return r
}
// checkRQLiteCrossNode evaluates cluster-wide RQLite invariants that require
// comparing /status data from at least two nodes: leader uniqueness and
// agreement, Raft term consistency, applied-index and database-size
// convergence, version skew, and quorum arithmetic over the voters.
func checkRQLiteCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	// Collect only the nodes that produced a parseable /status payload.
	type nodeInfo struct {
		host   string
		name   string
		status *inspector.RQLiteStatus
	}
	var nodes []nodeInfo
	for host, nd := range data.Nodes {
		if nd.RQLite != nil && nd.RQLite.Status != nil {
			nodes = append(nodes, nodeInfo{host: host, name: nd.Node.Name(), status: nd.RQLite.Status})
		}
	}
	// Cross-node comparisons are meaningless with fewer than two data points.
	if len(nodes) < 2 {
		r = append(r, inspector.Skip("rqlite.cross_node", "Cross-node checks", rqliteSub, "",
			fmt.Sprintf("only %d node(s) with RQLite data, need >=2", len(nodes)), inspector.Critical))
		return r
	}
	// 1.5 Exactly one leader — zero leaders means an unled cluster, more than
	// one indicates split brain.
	leaders := 0
	var leaderName string
	for _, n := range nodes {
		if n.status.RaftState == "Leader" {
			leaders++
			leaderName = n.name
		}
	}
	switch leaders {
	case 1:
		r = append(r, inspector.Pass("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("leader=%s", leaderName), inspector.Critical))
	case 0:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			"no leader found", inspector.Critical))
	default:
		r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "",
			fmt.Sprintf("found %d leaders (split brain!)", leaders), inspector.Critical))
	}
	// 1.6 Term consistency — every node should report the same Raft term.
	// Group node names by term; a single key means all agree.
	terms := map[uint64][]string{}
	for _, n := range nodes {
		terms[n.status.Term] = append(terms[n.status.Term], n.name)
	}
	if len(terms) == 1 {
		for t := range terms {
			r = append(r, inspector.Pass("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
				fmt.Sprintf("term=%d across %d nodes", t, len(nodes)), inspector.Critical))
		}
	} else {
		var parts []string
		for t, names := range terms {
			parts = append(parts, fmt.Sprintf("term=%d: %s", t, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "",
			"term divergence: "+strings.Join(parts, "; "), inspector.Critical))
	}
	// 1.36 All nodes agree on the same leader ID (grouped like terms above).
	leaderIDs := map[string][]string{}
	for _, n := range nodes {
		leaderIDs[n.status.LeaderNodeID] = append(leaderIDs[n.status.LeaderNodeID], n.name)
	}
	if len(leaderIDs) == 1 {
		for lid := range leaderIDs {
			r = append(r, inspector.Pass("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
				fmt.Sprintf("leader_id=%s", lid), inspector.Critical))
		}
	} else {
		var parts []string
		for lid, names := range leaderIDs {
			id := lid
			if id == "" {
				id = "(none)" // make the empty leader ID visible in the message
			}
			parts = append(parts, fmt.Sprintf("%s: %s", id, strings.Join(names, ",")))
		}
		r = append(r, inspector.Fail("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "",
			"leader disagreement: "+strings.Join(parts, "; "), inspector.Critical))
	}
	// 1.38 Applied index convergence — the spread between the slowest and the
	// fastest node's applied index. Nodes reporting 0 are ignored.
	var minApplied, maxApplied uint64
	hasApplied := false
	for _, n := range nodes {
		idx := n.status.AppliedIndex
		if idx == 0 {
			continue
		}
		if !hasApplied {
			minApplied = idx
			maxApplied = idx
			hasApplied = true
			continue
		}
		if idx < minApplied {
			minApplied = idx
		}
		if idx > maxApplied {
			maxApplied = idx
		}
	}
	if hasApplied && maxApplied > 0 {
		gap := maxApplied - minApplied
		if gap < 100 {
			r = append(r, inspector.Pass("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d", minApplied, maxApplied, gap), inspector.Critical))
		} else if gap < 1000 {
			r = append(r, inspector.Warn("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (lagging)", minApplied, maxApplied, gap), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("rqlite.index_convergence", "Applied index convergence", rqliteSub, "",
				fmt.Sprintf("min=%d max=%d gap=%d (severely behind)", minApplied, maxApplied, gap), inspector.Critical))
		}
	}
	// 1.35 Version consistency — mixed versions are only a warning (e.g. a
	// rolling upgrade in progress). Nodes with no version string are ignored.
	versions := map[string][]string{}
	for _, n := range nodes {
		if n.status.Version != "" {
			versions[n.status.Version] = append(versions[n.status.Version], n.name)
		}
	}
	if len(versions) == 1 {
		for v := range versions {
			r = append(r, inspector.Pass("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
				fmt.Sprintf("version=%s", v), inspector.Medium))
		}
	} else if len(versions) > 1 {
		var parts []string
		for v, names := range versions {
			parts = append(parts, fmt.Sprintf("%s: %s", v, strings.Join(names, ",")))
		}
		r = append(r, inspector.Warn("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "",
			"version mismatch: "+strings.Join(parts, "; "), inspector.Medium))
	}
	// 1.40 Database size convergence — compare max/min DB size ratio across
	// nodes that reported a positive size; within 5% counts as converged.
	type sizeEntry struct {
		name string
		size int64
	}
	var sizes []sizeEntry
	for _, n := range nodes {
		if n.status.DBSize > 0 {
			sizes = append(sizes, sizeEntry{n.name, n.status.DBSize})
		}
	}
	if len(sizes) >= 2 {
		minSize := sizes[0].size
		maxSize := sizes[0].size
		for _, s := range sizes[1:] {
			if s.size < minSize {
				minSize = s.size
			}
			if s.size > maxSize {
				maxSize = s.size
			}
		}
		if minSize > 0 {
			ratio := float64(maxSize) / float64(minSize)
			if ratio <= 1.05 {
				r = append(r, inspector.Pass("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f", minSize, maxSize, ratio), inspector.Medium))
			} else {
				r = append(r, inspector.Warn("rqlite.db_size_convergence", "Database size converged", rqliteSub, "",
					fmt.Sprintf("min=%dB max=%dB ratio=%.2f (diverged)", minSize, maxSize, ratio), inspector.High))
			}
		}
	}
	// 1.42 Quorum math: majority of voters must be reachable. A voter that
	// appears in `nodes` at all answered SSH + curl, so it counts as reachable.
	voters := 0
	reachableVoters := 0
	for _, n := range nodes {
		if n.status.Voter {
			voters++
			reachableVoters++ // responded to SSH + curl = reachable
		}
	}
	quorumNeeded := int(math.Floor(float64(voters)/2)) + 1
	if reachableVoters >= quorumNeeded {
		r = append(r, inspector.Pass("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d", reachableVoters, voters, quorumNeeded), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("rqlite.quorum", "Quorum maintained", rqliteSub, "",
			fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d (QUORUM LOST)", reachableVoters, voters, quorumNeeded), inspector.Critical))
	}
	return r
}
// countRQLiteNodes counts nodes that have RQLite data.
func countRQLiteNodes(data *inspector.ClusterData) int {
	var total int
	for _, node := range data.Nodes {
		if node.RQLite == nil {
			continue
		}
		total++
	}
	return total
}

View File

@ -0,0 +1,401 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckRQLite_Unresponsive verifies that an unresponsive HTTP endpoint
// fails the check and short-circuits the remaining per-node probes.
func TestCheckRQLite_Unresponsive(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{Responsive: false}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	expectStatus(t, got, "rqlite.responsive", inspector.StatusFail)
	// Should return early — no raft_state check
	if findCheck(got, "rqlite.raft_state") != nil {
		t.Error("should not check raft_state when unresponsive")
	}
}
// TestCheckRQLite_HealthyLeader verifies that a fully healthy leader node
// passes every applicable per-node RQLite check.
func TestCheckRQLite_HealthyLeader(t *testing.T) {
	n := makeNodeData("1.1.1.1", "node")
	n.RQLite = &inspector.RQLiteData{
		Responsive: true,
		StrongRead: true,
		Readyz:     &inspector.RQLiteReadyz{Ready: true, Node: "ready", Leader: "ready", Store: "ready"},
		Status: &inspector.RQLiteStatus{
			RaftState:      "Leader",
			LeaderNodeID:   "node1",
			Voter:          true,
			NumPeers:       2,
			Term:           5,
			CommitIndex:    1000,
			AppliedIndex:   1000,
			FsmPending:     0,
			LastLogTerm:    5,
			DBAppliedIndex: 1000,
			FsmIndex:       1000,
			LastSnapshot:   995,
			DBSizeFriendly: "1.2MB",
			Goroutines:     50,
			HeapAlloc:      100 * 1024 * 1024, // 100MB
			Version:        "8.0.0",
		},
		Nodes: map[string]*inspector.RQLiteNode{
			"node1:5001": {Addr: "node1:5001", Reachable: true, Leader: true, Voter: true},
			"node2:5001": {Addr: "node2:5001", Reachable: true, Leader: false, Voter: true},
			"node3:5001": {Addr: "node3:5001", Reachable: true, Leader: false, Voter: true},
		},
		DebugVars: &inspector.RQLiteDebugVars{},
	}
	got := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": n}))
	passing := []string{
		"rqlite.responsive", "rqlite.readyz", "rqlite.raft_state",
		"rqlite.leader_known", "rqlite.voter", "rqlite.commit_applied_gap",
		"rqlite.fsm_pending", "rqlite.db_fsm_sync", "rqlite.strong_read",
		"rqlite.all_reachable", "rqlite.goroutines", "rqlite.memory",
		"rqlite.query_errors", "rqlite.execute_errors", "rqlite.leader_not_found",
		"rqlite.snapshot_errors", "rqlite.client_health",
	}
	for _, id := range passing {
		expectStatus(t, got, id, inspector.StatusPass)
	}
}
// TestCheckRQLite_RaftStates verifies how each raft state maps to a check
// status: Leader/Follower pass, Candidate warns, anything else fails.
func TestCheckRQLite_RaftStates(t *testing.T) {
	cases := []struct {
		state string
		want  inspector.Status
	}{
		{"Leader", inspector.StatusPass},
		{"Follower", inspector.StatusPass},
		{"Candidate", inspector.StatusWarn},
		{"Shutdown", inspector.StatusFail},
		{"Unknown", inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.state, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    tc.state,
					LeaderNodeID: "node1",
					Voter:        true,
				},
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckRQLite(data), "rqlite.raft_state", tc.want)
		})
	}
}
func TestCheckRQLite_ReadyzFail(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.RQLite = &inspector.RQLiteData{
Responsive: true,
Readyz: &inspector.RQLiteReadyz{Ready: false, Node: "ready", Leader: "not ready", Store: "ready"},
Status: &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
}
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckRQLite(data)
expectStatus(t, results, "rqlite.readyz", inspector.StatusFail)
}
// TestCheckRQLite_CommitAppliedGap exercises the commit/applied index gap
// thresholds: tiny gaps pass, a moderate lag warns, a large one fails.
func TestCheckRQLite_CommitAppliedGap(t *testing.T) {
	cases := []struct {
		name            string
		commit, applied uint64
		want            inspector.Status
	}{
		{"no gap", 1000, 1000, inspector.StatusPass},
		{"small gap", 1002, 1000, inspector.StatusPass},
		{"lagging", 1050, 1000, inspector.StatusWarn},
		{"severely behind", 2000, 1000, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Follower",
					LeaderNodeID: "n1",
					Voter:        true,
					CommitIndex:  tc.commit,
					AppliedIndex: tc.applied,
				},
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckRQLite(data), "rqlite.commit_applied_gap", tc.want)
		})
	}
}
// TestCheckRQLite_FsmPending exercises the FSM pending-apply backlog
// thresholds: zero passes, a small backlog warns, a large one fails.
func TestCheckRQLite_FsmPending(t *testing.T) {
	cases := []struct {
		name    string
		pending uint64
		want    inspector.Status
	}{
		{"zero", 0, inspector.StatusPass},
		{"small", 5, inspector.StatusWarn},
		{"backlog", 100, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Follower",
					LeaderNodeID: "n1",
					Voter:        true,
					FsmPending:   tc.pending,
				},
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckRQLite(data), "rqlite.fsm_pending", tc.want)
		})
	}
}
// TestCheckRQLite_StrongReadFail verifies that a failed strong-consistency
// read probe is reported as a failing strong_read check.
func TestCheckRQLite_StrongReadFail(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.RQLite = &inspector.RQLiteData{
		Responsive: true,
		StrongRead: false,
		Status:     &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	expectStatus(t, CheckRQLite(data), "rqlite.strong_read", inspector.StatusFail)
}
// TestCheckRQLite_DebugVarsErrors verifies that non-zero error counters in
// /debug/vars are classified correctly (warn for query/execute/client
// issues, fail for leader-not-found and snapshot errors).
func TestCheckRQLite_DebugVarsErrors(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Status:     &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true},
		DebugVars: &inspector.RQLiteDebugVars{
			QueryErrors:    5,
			ExecuteErrors:  3,
			LeaderNotFound: 1,
			SnapshotErrors: 2,
			ClientRetries:  10,
			ClientTimeouts: 1,
		},
	}
	results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	want := map[string]inspector.Status{
		"rqlite.query_errors":     inspector.StatusWarn,
		"rqlite.execute_errors":   inspector.StatusWarn,
		"rqlite.leader_not_found": inspector.StatusFail,
		"rqlite.snapshot_errors":  inspector.StatusFail,
		"rqlite.client_health":    inspector.StatusWarn,
	}
	for id, status := range want {
		expectStatus(t, results, id, status)
	}
}
// TestCheckRQLite_Goroutines exercises the goroutine-count thresholds:
// modest counts pass, elevated counts warn, very high counts fail.
func TestCheckRQLite_Goroutines(t *testing.T) {
	cases := []struct {
		name       string
		goroutines int
		want       inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 500, inspector.StatusWarn},
		{"high", 2000, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.RQLite = &inspector.RQLiteData{
				Responsive: true,
				Status: &inspector.RQLiteStatus{
					RaftState:    "Leader",
					LeaderNodeID: "n1",
					Voter:        true,
					Goroutines:   tc.goroutines,
				},
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckRQLite(data), "rqlite.goroutines", tc.want)
		})
	}
}
// --- Cross-node tests ---
// makeRQLiteCluster builds a ClusterData where every host in states runs an
// RQLite node with the given raft state, the shared leader ID, and the given
// term. Every node is handed the same full membership list.
func makeRQLiteCluster(leaderHost string, states map[string]string, term uint64) *inspector.ClusterData {
	membership := make(map[string]*inspector.RQLiteNode, len(states))
	for host, state := range states {
		membership[host+":5001"] = &inspector.RQLiteNode{
			Addr:      host + ":5001",
			Reachable: true,
			Voter:     true,
			Leader:    state == "Leader",
		}
	}
	nodes := make(map[string]*inspector.NodeData, len(states))
	for host, state := range states {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderHost,
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
				CommitIndex:  1000,
				Version:      "8.0.0",
				DBSize:       4096,
			},
			Nodes: membership,
		}
		nodes[host] = nd
	}
	return makeCluster(nodes)
}
// TestCheckRQLite_CrossNode_SingleLeader verifies that a healthy 3-node
// cluster with exactly one leader passes every cross-node check.
func TestCheckRQLite_CrossNode_SingleLeader(t *testing.T) {
	data := makeRQLiteCluster("1.1.1.1", map[string]string{
		"1.1.1.1": "Leader",
		"2.2.2.2": "Follower",
		"3.3.3.3": "Follower",
	}, 5)
	results := CheckRQLite(data)
	for _, id := range []string{
		"rqlite.single_leader",
		"rqlite.term_consistent",
		"rqlite.leader_agreement",
		"rqlite.index_convergence",
		"rqlite.version_consistent",
		"rqlite.quorum",
	} {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
func TestCheckRQLite_CrossNode_NoLeader(t *testing.T) {
data := makeRQLiteCluster("", map[string]string{
"1.1.1.1": "Candidate",
"2.2.2.2": "Candidate",
"3.3.3.3": "Candidate",
}, 5)
results := CheckRQLite(data)
expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail)
}
// TestCheckRQLite_CrossNode_SplitBrain verifies that two simultaneous
// self-proclaimed leaders fail the single-leader check.
func TestCheckRQLite_CrossNode_SplitBrain(t *testing.T) {
	// Nodes 1 and 2 both claim leadership; node 3 follows node 1.
	leaders := map[string]bool{"1.1.1.1": true, "2.2.2.2": true}
	nodes := map[string]*inspector.NodeData{}
	for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} {
		state, leaderID := "Follower", "1.1.1.1"
		if leaders[host] {
			state, leaderID = "Leader", host
		}
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: leaderID,
				Voter:        true,
				Term:         5,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	results := CheckRQLite(makeCluster(nodes))
	expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail)
}
// TestCheckRQLite_CrossNode_TermDivergence verifies that nodes reporting
// different raft terms fail the term-consistency check.
func TestCheckRQLite_CrossNode_TermDivergence(t *testing.T) {
	terms := map[string]uint64{"1.1.1.1": 5, "2.2.2.2": 5, "3.3.3.3": 6}
	nodes := make(map[string]*inspector.NodeData, len(terms))
	for host, term := range terms {
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    "Follower",
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         term,
				AppliedIndex: 1000,
			},
		}
		nodes[host] = nd
	}
	expectStatus(t, CheckRQLite(makeCluster(nodes)), "rqlite.term_consistent", inspector.StatusFail)
}
// TestCheckRQLite_CrossNode_IndexLagging verifies that a follower whose
// applied index trails the rest of the cluster triggers a convergence warning.
func TestCheckRQLite_CrossNode_IndexLagging(t *testing.T) {
	applied := map[string]uint64{"1.1.1.1": 1000, "2.2.2.2": 1000, "3.3.3.3": 500}
	nodes := make(map[string]*inspector.NodeData, len(applied))
	for host, idx := range applied {
		state := "Follower"
		if host == "1.1.1.1" {
			state = "Leader"
		}
		nd := makeNodeData(host, "node")
		nd.RQLite = &inspector.RQLiteData{
			Responsive: true,
			Status: &inspector.RQLiteStatus{
				RaftState:    state,
				LeaderNodeID: "1.1.1.1",
				Voter:        true,
				Term:         5,
				AppliedIndex: idx,
				CommitIndex:  idx,
			},
		}
		nodes[host] = nd
	}
	expectStatus(t, CheckRQLite(makeCluster(nodes)), "rqlite.index_convergence", inspector.StatusWarn)
}
// TestCheckRQLite_CrossNode_SkipSingleNode verifies that cross-node checks
// are skipped when the cluster has only one node.
func TestCheckRQLite_CrossNode_SkipSingleNode(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.RQLite = &inspector.RQLiteData{
		Responsive: true,
		Status: &inspector.RQLiteStatus{
			RaftState:    "Leader",
			LeaderNodeID: "n1",
			Voter:        true,
			Term:         5,
			AppliedIndex: 1000,
		},
	}
	results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	expectStatus(t, results, "rqlite.cross_node", inspector.StatusSkip)
}
// TestCheckRQLite_NilRQLiteData verifies that a node with no RQLite data
// produces no per-node results — only the cross-node "not enough nodes" skip.
func TestCheckRQLite_NilRQLiteData(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	// nd.RQLite stays nil on purpose.
	results := CheckRQLite(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	for _, res := range results {
		if res.Status != inspector.StatusSkip {
			t.Errorf("unexpected non-skip result: %s (status=%s)", res.ID, res.Status)
		}
	}
}

View File

@ -0,0 +1,242 @@
package checks
import (
"fmt"
"strconv"
"strings"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the system checker with the inspector framework at package
// load time so it is discovered automatically when checks are run.
func init() {
	inspector.RegisterChecker("system", CheckSystem)
}

// systemSub is the subsystem label attached to every system-level CheckResult.
const systemSub = "system"
// CheckSystem runs all system-level health checks, producing one set of
// results per node that reported system data. Nodes without system data are
// skipped entirely.
func CheckSystem(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.System != nil {
			out = append(out, checkSystemPerNode(nd)...)
		}
	}
	return out
}
// checkSystemPerNode evaluates OS-level health for a single node. It is split
// into four phases, emitted in a fixed order: systemd service states,
// resource headroom (memory/disk/load), runtime hygiene (OOM, swap, uptime,
// inodes, firewall, process user, panics), and expected listening ports.
func checkSystemPerNode(nd *inspector.NodeData) []inspector.CheckResult {
	sys := nd.System
	node := nd.Node.Name()
	var r []inspector.CheckResult
	r = append(r, checkSystemServices(nd, node)...)
	r = append(r, checkSystemResources(sys, node)...)
	r = append(r, checkSystemHygiene(sys, node)...)
	r = append(r, checkSystemPorts(sys, node)...)
	return r
}

// checkSystemServices verifies systemd unit state: the four core DeBros
// services (6.1), the WireGuard unit when present (6.5), nameserver-only
// services (6.3), and the absence of failed units (6.6).
func checkSystemServices(nd *inspector.NodeData, node string) []inspector.CheckResult {
	var r []inspector.CheckResult
	sys := nd.System
	// 6.1 Core services active — a missing entry is reported as "unknown".
	coreServices := []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster"}
	for _, svc := range coreServices {
		status, ok := sys.Services[svc]
		if !ok {
			status = "unknown"
		}
		id := fmt.Sprintf("system.svc_%s", strings.ReplaceAll(svc, "-", "_"))
		name := fmt.Sprintf("%s service active", svc)
		if status == "active" {
			r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical))
		} else {
			r = append(r, inspector.Fail(id, name, systemSub, node,
				fmt.Sprintf("status=%s", status), inspector.Critical))
		}
	}
	// 6.5 WireGuard service — only checked when the collector saw the unit.
	if status, ok := sys.Services["wg-quick@wg0"]; ok {
		if status == "active" {
			r = append(r, inspector.Pass("system.svc_wg", "wg-quick@wg0 active", systemSub, node, "active", inspector.Critical))
		} else {
			r = append(r, inspector.Fail("system.svc_wg", "wg-quick@wg0 active", systemSub, node,
				fmt.Sprintf("status=%s", status), inspector.Critical))
		}
	}
	// 6.3 Nameserver services — only on nodes flagged as nameservers.
	if nd.Node.IsNameserver() {
		for _, svc := range []string{"coredns", "caddy"} {
			status, ok := sys.Services[svc]
			if !ok {
				status = "unknown"
			}
			id := fmt.Sprintf("system.svc_%s", svc)
			name := fmt.Sprintf("%s service active", svc)
			if status == "active" {
				r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical))
			} else {
				r = append(r, inspector.Fail(id, name, systemSub, node,
					fmt.Sprintf("status=%s", status), inspector.Critical))
			}
		}
	}
	// 6.6 Failed systemd units
	if len(sys.FailedUnits) == 0 {
		r = append(r, inspector.Pass("system.no_failed_units", "No failed systemd units", systemSub, node,
			"no failed units", inspector.High))
	} else {
		r = append(r, inspector.Fail("system.no_failed_units", "No failed systemd units", systemSub, node,
			fmt.Sprintf("failed: %s", strings.Join(sys.FailedUnits, ", ")), inspector.High))
	}
	return r
}

// checkSystemResources covers memory (6.14), disk (6.15), and 1-minute load
// average vs CPU count (6.17). Checks are skipped when the underlying metric
// was not collected (zero totals, empty load string).
func checkSystemResources(sys *inspector.SystemData, node string) []inspector.CheckResult {
	var r []inspector.CheckResult
	// 6.14 Memory usage: <80% pass, <90% warn, otherwise fail.
	if sys.MemTotalMB > 0 {
		pct := float64(sys.MemUsedMB) / float64(sys.MemTotalMB) * 100
		if pct < 80 {
			r = append(r, inspector.Pass("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Medium))
		} else if pct < 90 {
			r = append(r, inspector.Warn("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.memory", "Memory usage healthy", systemSub, node,
				fmt.Sprintf("used=%dMB/%dMB (%.0f%% CRITICAL)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Critical))
		}
	}
	// 6.15 Disk usage: <80% pass, <90% warn, otherwise fail.
	if sys.DiskUsePct > 0 {
		if sys.DiskUsePct < 80 {
			r = append(r, inspector.Pass("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
		} else if sys.DiskUsePct < 90 {
			r = append(r, inspector.Warn("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.disk", "Disk usage healthy", systemSub, node,
				fmt.Sprintf("used=%s/%s (%d%% CRITICAL)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.Critical))
		}
	}
	// 6.17 Load average vs CPU count: <1x pass, <2x warn, otherwise fail.
	// A load string that fails to parse is silently skipped.
	if sys.LoadAvg != "" && sys.CPUCount > 0 {
		parts := strings.Split(strings.TrimSpace(sys.LoadAvg), ",")
		if len(parts) >= 1 {
			load1, err := strconv.ParseFloat(strings.TrimSpace(parts[0]), 64)
			if err == nil {
				cpus := float64(sys.CPUCount)
				if load1 < cpus {
					r = append(r, inspector.Pass("system.load", "Load average healthy", systemSub, node,
						fmt.Sprintf("load1=%.1f cpus=%d", load1, sys.CPUCount), inspector.Medium))
				} else if load1 < cpus*2 {
					r = append(r, inspector.Warn("system.load", "Load average healthy", systemSub, node,
						fmt.Sprintf("load1=%.1f cpus=%d (elevated)", load1, sys.CPUCount), inspector.Medium))
				} else {
					r = append(r, inspector.Fail("system.load", "Load average healthy", systemSub, node,
						fmt.Sprintf("load1=%.1f cpus=%d (overloaded)", load1, sys.CPUCount), inspector.High))
				}
			}
		}
	}
	return r
}

// checkSystemHygiene covers OOM kills (6.18), swap (6.19), uptime (6.20),
// inodes (6.21), UFW (6.22), the debros-node process user (6.23), and
// panic/fatal log lines (6.24).
func checkSystemHygiene(sys *inspector.SystemData, node string) []inspector.CheckResult {
	var r []inspector.CheckResult
	// 6.18 OOM kills — any kill seen in dmesg is a failure.
	if sys.OOMKills == 0 {
		r = append(r, inspector.Pass("system.oom", "No OOM kills", systemSub, node,
			"no OOM kills in dmesg", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("system.oom", "No OOM kills", systemSub, node,
			fmt.Sprintf("%d OOM kills in dmesg", sys.OOMKills), inspector.Critical))
	}
	// 6.19 Swap usage: <30% passes, anything more warns.
	if sys.SwapTotalMB > 0 {
		pct := float64(sys.SwapUsedMB) / float64(sys.SwapTotalMB) * 100
		if pct < 30 {
			r = append(r, inspector.Pass("system.swap", "Swap usage low", systemSub, node,
				fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
		} else {
			r = append(r, inspector.Warn("system.swap", "Swap usage low", systemSub, node,
				fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium))
		}
	}
	// 6.20 Uptime — informational only; pass whenever it was reported.
	if sys.UptimeRaw != "" && sys.UptimeRaw != "unknown" {
		r = append(r, inspector.Pass("system.uptime", "System uptime reported", systemSub, node,
			fmt.Sprintf("up since %s", sys.UptimeRaw), inspector.Low))
	}
	// 6.21 Inode usage: <80% pass, <95% warn, otherwise fail.
	if sys.InodePct > 0 {
		if sys.InodePct < 80 {
			r = append(r, inspector.Pass("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%%", sys.InodePct), inspector.High))
		} else if sys.InodePct < 95 {
			r = append(r, inspector.Warn("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%% (elevated)", sys.InodePct), inspector.High))
		} else {
			r = append(r, inspector.Fail("system.inodes", "Inode usage healthy", systemSub, node,
				fmt.Sprintf("inode_use=%d%% (CRITICAL)", sys.InodePct), inspector.Critical))
		}
	}
	// 6.22 UFW firewall
	if sys.UFWActive {
		r = append(r, inspector.Pass("system.ufw", "UFW firewall active", systemSub, node,
			"ufw is active", inspector.High))
	} else {
		r = append(r, inspector.Warn("system.ufw", "UFW firewall active", systemSub, node,
			"ufw is not active", inspector.High))
	}
	// 6.23 Process user — root gets a high-severity warn, any other
	// unexpected user a medium one.
	if sys.ProcessUser != "" && sys.ProcessUser != "unknown" {
		if sys.ProcessUser == "debros" {
			r = append(r, inspector.Pass("system.process_user", "debros-node runs as correct user", systemSub, node,
				"user=debros", inspector.High))
		} else if sys.ProcessUser == "root" {
			r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
				"user=root (should be debros)", inspector.High))
		} else {
			r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node,
				fmt.Sprintf("user=%s (expected debros)", sys.ProcessUser), inspector.Medium))
		}
	}
	// 6.24 Panic/fatal in logs
	if sys.PanicCount == 0 {
		r = append(r, inspector.Pass("system.panics", "No panics in recent logs", systemSub, node,
			"0 panic/fatal in last hour", inspector.Critical))
	} else {
		r = append(r, inspector.Fail("system.panics", "No panics in recent logs", systemSub, node,
			fmt.Sprintf("%d panic/fatal in last hour", sys.PanicCount), inspector.Critical))
	}
	return r
}

// checkSystemPorts verifies that the expected service ports are bound (6.25).
// An ordered slice (rather than the previous map) keeps result ordering
// deterministic across runs.
func checkSystemPorts(sys *inspector.SystemData, node string) []inspector.CheckResult {
	var r []inspector.CheckResult
	expected := []struct {
		port int
		name string
	}{
		{5001, "RQLite HTTP"},
		{3322, "Olric Memberlist"},
		{6001, "Gateway"},
		{4501, "IPFS API"},
	}
	for _, e := range expected {
		found := false
		for _, p := range sys.ListeningPorts {
			if p == e.port {
				found = true
				break
			}
		}
		id := fmt.Sprintf("system.port_%d", e.port)
		name := fmt.Sprintf("%s port %d listening", e.name, e.port)
		if found {
			r = append(r, inspector.Pass(id, name, systemSub, node, "port is bound", inspector.High))
		} else {
			r = append(r, inspector.Warn(id, name, systemSub, node, "port is NOT bound", inspector.High))
		}
	}
	return r
}

View File

@ -0,0 +1,284 @@
package checks
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// TestCheckSystem_HealthyNode verifies that a fully healthy node passes
// every system-level check.
func TestCheckSystem_HealthyNode(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "active",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "active",
			"wg-quick@wg0":        "active",
		},
		FailedUnits:    nil,
		MemTotalMB:     8192,
		MemUsedMB:      4096,
		DiskUsePct:     50,
		DiskUsedGB:     "25G",
		DiskTotalGB:    "50G",
		LoadAvg:        "1.0, 0.8, 0.5",
		CPUCount:       4,
		OOMKills:       0,
		SwapTotalMB:    2048,
		SwapUsedMB:     100,
		UptimeRaw:      "2024-01-01 00:00:00",
		InodePct:       10,
		ListeningPorts: []int{5001, 3322, 6001, 4501},
		UFWActive:      true,
		ProcessUser:    "debros",
		PanicCount:     0,
	}
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	passIDs := []string{
		"system.svc_debros_node", "system.svc_debros_olric", "system.svc_debros_ipfs",
		"system.svc_debros_ipfs_cluster", "system.svc_wg", "system.no_failed_units",
		"system.memory", "system.disk", "system.load", "system.oom", "system.swap",
		"system.inodes", "system.ufw", "system.process_user", "system.panics",
		"system.port_5001", "system.port_3322", "system.port_6001", "system.port_4501",
	}
	for _, id := range passIDs {
		expectStatus(t, results, id, inspector.StatusPass)
	}
}
// TestCheckSystem_ServiceInactive verifies that inactive or failed core
// services are reported as failures while active ones still pass.
func TestCheckSystem_ServiceInactive(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services: map[string]string{
			"debros-node":         "active",
			"debros-olric":        "inactive",
			"debros-ipfs":         "active",
			"debros-ipfs-cluster": "failed",
		},
	}
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	want := map[string]inspector.Status{
		"system.svc_debros_node":         inspector.StatusPass,
		"system.svc_debros_olric":        inspector.StatusFail,
		"system.svc_debros_ipfs_cluster": inspector.StatusFail,
	}
	for id, status := range want {
		expectStatus(t, results, id, status)
	}
}
// TestCheckSystem_NameserverServices verifies that coredns and caddy are
// checked (and pass) on a nameserver node.
func TestCheckSystem_NameserverServices(t *testing.T) {
	svcs := map[string]string{}
	for _, s := range []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster", "coredns", "caddy"} {
		svcs[s] = "active"
	}
	nd := makeNodeData("5.5.5.5", "nameserver-ns1")
	nd.System = &inspector.SystemData{Services: svcs}
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}))
	expectStatus(t, results, "system.svc_coredns", inspector.StatusPass)
	expectStatus(t, results, "system.svc_caddy", inspector.StatusPass)
}
// TestCheckSystem_NameserverServicesNotCheckedOnRegularNode verifies that
// nameserver-only services are not probed on an ordinary node.
func TestCheckSystem_NameserverServicesNotCheckedOnRegularNode(t *testing.T) {
	svcs := map[string]string{}
	for _, s := range []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster"} {
		svcs[s] = "active"
	}
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{Services: svcs}
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	if findCheck(results, "system.svc_coredns") != nil {
		t.Error("should not check coredns on regular node")
	}
}
// TestCheckSystem_FailedUnits verifies that any failed systemd unit flips
// the no-failed-units check to a failure.
func TestCheckSystem_FailedUnits(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services:    map[string]string{},
		FailedUnits: []string{"some-service.service"},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	expectStatus(t, CheckSystem(data), "system.no_failed_units", inspector.StatusFail)
}
// TestCheckSystem_Memory exercises the memory-usage thresholds:
// <80% pass, 80-90% warn, >=90% fail.
func TestCheckSystem_Memory(t *testing.T) {
	cases := []struct {
		name        string
		used, total int
		want        inspector.Status
	}{
		{"healthy", 4000, 8000, inspector.StatusPass},  // 50%
		{"elevated", 7000, 8000, inspector.StatusWarn}, // 87.5%
		{"critical", 7500, 8000, inspector.StatusFail}, // 93.75%
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services:   map[string]string{},
				MemTotalMB: tc.total,
				MemUsedMB:  tc.used,
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckSystem(data), "system.memory", tc.want)
		})
	}
}
// TestCheckSystem_Disk exercises the disk-usage thresholds:
// <80% pass, 80-90% warn, >=90% fail.
func TestCheckSystem_Disk(t *testing.T) {
	cases := []struct {
		name string
		pct  int
		want inspector.Status
	}{
		{"healthy", 60, inspector.StatusPass},
		{"elevated", 85, inspector.StatusWarn},
		{"critical", 92, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services:    map[string]string{},
				DiskUsePct:  tc.pct,
				DiskUsedGB:  "25G",
				DiskTotalGB: "50G",
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckSystem(data), "system.disk", tc.want)
		})
	}
}
// TestCheckSystem_Load exercises the load-average thresholds relative to
// CPU count: <1x pass, 1-2x warn, >=2x fail.
func TestCheckSystem_Load(t *testing.T) {
	cases := []struct {
		name string
		load string
		cpus int
		want inspector.Status
	}{
		{"healthy", "1.0, 0.8, 0.5", 4, inspector.StatusPass},
		{"elevated", "6.0, 5.0, 4.0", 4, inspector.StatusWarn},
		{"overloaded", "10.0, 9.0, 8.0", 4, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{
				Services: map[string]string{},
				LoadAvg:  tc.load,
				CPUCount: tc.cpus,
			}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckSystem(data), "system.load", tc.want)
		})
	}
}
// TestCheckSystem_OOMKills verifies that any OOM kill is reported as a failure.
func TestCheckSystem_OOMKills(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{Services: map[string]string{}, OOMKills: 3}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	expectStatus(t, CheckSystem(data), "system.oom", inspector.StatusFail)
}
// TestCheckSystem_Inodes exercises the inode-usage thresholds:
// <80% pass, 80-95% warn, >=95% fail.
func TestCheckSystem_Inodes(t *testing.T) {
	cases := []struct {
		name string
		pct  int
		want inspector.Status
	}{
		{"healthy", 50, inspector.StatusPass},
		{"elevated", 82, inspector.StatusWarn},
		{"critical", 96, inspector.StatusFail},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{Services: map[string]string{}, InodePct: tc.pct}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckSystem(data), "system.inodes", tc.want)
		})
	}
}
// TestCheckSystem_ProcessUser verifies the process-user check: "debros"
// passes, while root or any other user warns.
func TestCheckSystem_ProcessUser(t *testing.T) {
	cases := []struct {
		name string
		user string
		want inspector.Status
	}{
		{"correct", "debros", inspector.StatusPass},
		{"root", "root", inspector.StatusWarn},
		{"other", "ubuntu", inspector.StatusWarn},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			nd := makeNodeData("1.1.1.1", "node")
			nd.System = &inspector.SystemData{Services: map[string]string{}, ProcessUser: tc.user}
			data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
			expectStatus(t, CheckSystem(data), "system.process_user", tc.want)
		})
	}
}
// TestCheckSystem_Panics verifies that panic/fatal log entries are reported
// as a failing check.
func TestCheckSystem_Panics(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{Services: map[string]string{}, PanicCount: 5}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	expectStatus(t, CheckSystem(data), "system.panics", inspector.StatusFail)
}
// TestCheckSystem_ExpectedPorts verifies that bound ports pass and unbound
// expected ports produce warnings.
func TestCheckSystem_ExpectedPorts(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.System = &inspector.SystemData{
		Services:       map[string]string{},
		ListeningPorts: []int{5001, 6001}, // 3322 and 4501 deliberately absent
	}
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	want := map[string]inspector.Status{
		"system.port_5001": inspector.StatusPass,
		"system.port_6001": inspector.StatusPass,
		"system.port_3322": inspector.StatusWarn,
		"system.port_4501": inspector.StatusWarn,
	}
	for id, status := range want {
		expectStatus(t, results, id, status)
	}
}
// TestCheckSystem_NilData verifies that a node without system data produces
// no results at all.
func TestCheckSystem_NilData(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	// nd.System is left nil on purpose.
	results := CheckSystem(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}))
	if n := len(results); n != 0 {
		t.Errorf("expected 0 results for nil System data, got %d", n)
	}
}

View File

@ -0,0 +1,270 @@
package checks
import (
"fmt"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// init registers the WireGuard checker with the inspector framework at
// package load time so it is discovered automatically when checks are run.
func init() {
	inspector.RegisterChecker("wireguard", CheckWireGuard)
}

// wgSub is the subsystem label attached to every WireGuard CheckResult.
const wgSub = "wireguard"
// CheckWireGuard runs all WireGuard health checks: per-node probes for every
// node that reported WireGuard data, followed by the cross-node comparisons.
func CheckWireGuard(data *inspector.ClusterData) []inspector.CheckResult {
	var out []inspector.CheckResult
	for _, nd := range data.Nodes {
		if nd.WireGuard != nil {
			out = append(out, checkWGPerNode(nd, data)...)
		}
	}
	return append(out, checkWGCrossNode(data)...)
}
// shortPeerKey abbreviates a WireGuard public key for display as
// "XXXXXXXX...YYYY" (first 8 plus last 4 characters). Keys shorter than 12
// characters are returned unchanged, so the function never panics on a
// truncated or malformed key — the previous inline slicing
// (PublicKey[:8] / PublicKey[len-4:]) did for keys shorter than 8 chars.
func shortPeerKey(k string) string {
	if len(k) < 12 {
		return k
	}
	return k[:8] + "..." + k[len(k)-4:]
}

// checkWGPerNode evaluates one node's WireGuard health: interface/service
// state, addressing, listen port, expected peer count, config hygiene, and
// per-peer allowed-IP/handshake/traffic checks. When the wg0 interface is
// down only that failure is returned, since every other probe depends on a
// live interface.
func checkWGPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	wg := nd.WireGuard
	node := nd.Node.Name()
	// 5.1 Interface up
	if wg.InterfaceUp {
		r = append(r, inspector.Pass("wg.interface_up", "WireGuard interface up", wgSub, node,
			fmt.Sprintf("wg0 up, IP=%s", wg.WgIP), inspector.Critical))
	} else {
		r = append(r, inspector.Fail("wg.interface_up", "WireGuard interface up", wgSub, node,
			"wg0 interface is DOWN", inspector.Critical))
		return r
	}
	// 5.2 Service active
	if wg.ServiceActive {
		r = append(r, inspector.Pass("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
			"service is active", inspector.Critical))
	} else {
		r = append(r, inspector.Warn("wg.service_active", "wg-quick@wg0 service active", wgSub, node,
			"service not active (interface up but service not managed by systemd?)", inspector.High))
	}
	// 5.5 Correct IP in 10.0.0.0/24 — skipped when no IP was collected.
	if wg.WgIP != "" && strings.HasPrefix(wg.WgIP, "10.0.0.") {
		r = append(r, inspector.Pass("wg.correct_ip", "WG IP in expected range", wgSub, node,
			fmt.Sprintf("IP=%s (10.0.0.0/24)", wg.WgIP), inspector.Critical))
	} else if wg.WgIP != "" {
		r = append(r, inspector.Warn("wg.correct_ip", "WG IP in expected range", wgSub, node,
			fmt.Sprintf("IP=%s (not in 10.0.0.0/24)", wg.WgIP), inspector.High))
	}
	// 5.4 Listen port — skipped when unknown (0).
	if wg.ListenPort == 51820 {
		r = append(r, inspector.Pass("wg.listen_port", "Listen port is 51820", wgSub, node,
			"port=51820", inspector.Critical))
	} else if wg.ListenPort > 0 {
		r = append(r, inspector.Warn("wg.listen_port", "Listen port is 51820", wgSub, node,
			fmt.Sprintf("port=%d (expected 51820)", wg.ListenPort), inspector.High))
	}
	// 5.7 Peer count: a full mesh has one peer per other WG node.
	expectedNodes := countWGNodes(data)
	expectedPeers := expectedNodes - 1
	if expectedPeers < 0 {
		expectedPeers = 0
	}
	if wg.PeerCount >= expectedPeers {
		r = append(r, inspector.Pass("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
	} else if wg.PeerCount > 0 {
		r = append(r, inspector.Warn("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High))
	} else {
		r = append(r, inspector.Fail("wg.peer_count", "Peer count matches expected", wgSub, node,
			fmt.Sprintf("peers=%d (isolated!)", wg.PeerCount), inspector.Critical))
	}
	// 5.29 MTU — 1420 is the expected value here; skipped when unknown (0).
	if wg.MTU == 1420 {
		r = append(r, inspector.Pass("wg.mtu", "MTU is 1420", wgSub, node,
			"MTU=1420", inspector.High))
	} else if wg.MTU > 0 {
		r = append(r, inspector.Warn("wg.mtu", "MTU is 1420", wgSub, node,
			fmt.Sprintf("MTU=%d (expected 1420)", wg.MTU), inspector.High))
	}
	// 5.35 Config file exists
	if wg.ConfigExists {
		r = append(r, inspector.Pass("wg.config_exists", "Config file exists", wgSub, node,
			"/etc/wireguard/wg0.conf present", inspector.High))
	} else {
		r = append(r, inspector.Warn("wg.config_exists", "Config file exists", wgSub, node,
			"/etc/wireguard/wg0.conf NOT found", inspector.High))
	}
	// 5.36 Config permissions — "" and "000" mean perms were not readable.
	if wg.ConfigPerms == "600" {
		r = append(r, inspector.Pass("wg.config_perms", "Config file permissions 600", wgSub, node,
			"perms=600", inspector.Critical))
	} else if wg.ConfigPerms != "" && wg.ConfigPerms != "000" {
		r = append(r, inspector.Warn("wg.config_perms", "Config file permissions 600", wgSub, node,
			fmt.Sprintf("perms=%s (expected 600)", wg.ConfigPerms), inspector.Critical))
	}
	// Per-peer checks: tally handshake/traffic anomalies in a single pass.
	now := time.Now().Unix()
	neverHandshaked := 0
	staleHandshakes := 0
	noTraffic := 0
	for _, peer := range wg.Peers {
		// 5.20 Each peer should carry a /32 allowed IP.
		if !strings.Contains(peer.AllowedIPs, "/32") {
			r = append(r, inspector.Warn("wg.peer_allowed_ip", "Peer has /32 allowed IP", wgSub, node,
				fmt.Sprintf("peer %s has allowed_ips=%s", shortPeerKey(peer.PublicKey), peer.AllowedIPs), inspector.High))
		}
		// 5.23 A 0.0.0.0/0 peer would route ALL traffic through the tunnel.
		if strings.Contains(peer.AllowedIPs, "0.0.0.0/0") {
			r = append(r, inspector.Fail("wg.peer_catch_all", "No catch-all route peer", wgSub, node,
				fmt.Sprintf("peer %s has 0.0.0.0/0 (route hijack!)", shortPeerKey(peer.PublicKey)), inspector.Critical))
		}
		// 5.11-5.12 Handshake freshness (0 means never handshaked).
		if peer.LatestHandshake == 0 {
			neverHandshaked++
		} else if now-peer.LatestHandshake > 300 {
			staleHandshakes++
		}
		// 5.13 Transfer stats
		if peer.TransferRx == 0 && peer.TransferTx == 0 {
			noTraffic++
		}
	}
	if len(wg.Peers) > 0 {
		// 5.12 Never handshaked
		if neverHandshaked == 0 {
			r = append(r, inspector.Pass("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers handshaked", len(wg.Peers), len(wg.Peers)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers never handshaked", neverHandshaked, len(wg.Peers)), inspector.Critical))
		}
		// 5.11 Stale handshakes
		if staleHandshakes == 0 {
			r = append(r, inspector.Pass("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
				"all handshakes within 5 minutes", inspector.High))
		} else {
			r = append(r, inspector.Warn("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
				fmt.Sprintf("%d/%d peers with stale handshake (>5m)", staleHandshakes, len(wg.Peers)), inspector.High))
		}
		// 5.13 Transfer
		if noTraffic == 0 {
			r = append(r, inspector.Pass("wg.peer_traffic", "All peers have traffic", wgSub, node,
				fmt.Sprintf("%d/%d peers with traffic", len(wg.Peers), len(wg.Peers)), inspector.High))
		} else {
			r = append(r, inspector.Warn("wg.peer_traffic", "All peers have traffic", wgSub, node,
				fmt.Sprintf("%d/%d peers with zero traffic", noTraffic, len(wg.Peers)), inspector.High))
		}
	}
	return r
}
// checkWGCrossNode runs WireGuard checks that compare state across nodes:
// peer-count consistency (5.8), MTU consistency (5.30), and public-key
// uniqueness (5.50). Nodes without an up WireGuard interface are excluded;
// fewer than two eligible nodes yields no results.
func checkWGCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
	var r []inspector.CheckResult
	type nodeInfo struct {
		name string
		wg   *inspector.WireGuardData
	}
	var nodes []nodeInfo
	for _, nd := range data.Nodes {
		if nd.WireGuard != nil && nd.WireGuard.InterfaceUp {
			nodes = append(nodes, nodeInfo{name: nd.Node.Name(), wg: nd.WireGuard})
		}
	}
	if len(nodes) < 2 {
		return r
	}
	// 5.8 Peer count consistent
	counts := map[int]int{}
	for _, n := range nodes {
		counts[n.wg.PeerCount]++
	}
	if len(counts) == 1 {
		for c := range counts {
			r = append(r, inspector.Pass("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
				fmt.Sprintf("all nodes have %d peers", c), inspector.High))
		}
	} else {
		var parts []string
		for c, num := range counts {
			parts = append(parts, fmt.Sprintf("%d nodes have %d peers", num, c))
		}
		r = append(r, inspector.Warn("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "",
			strings.Join(parts, "; "), inspector.High))
	}
	// 5.30 MTU consistent — nodes reporting MTU=0 are treated as unknown and skipped
	mtus := map[int]int{}
	for _, n := range nodes {
		if n.wg.MTU > 0 {
			mtus[n.wg.MTU]++
		}
	}
	if len(mtus) == 1 {
		for m := range mtus {
			r = append(r, inspector.Pass("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
				fmt.Sprintf("all nodes MTU=%d", m), inspector.High))
		}
	} else if len(mtus) > 1 {
		r = append(r, inspector.Warn("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "",
			fmt.Sprintf("%d different MTU values", len(mtus)), inspector.High))
	}
	// 5.50 Public key uniqueness. In a healthy mesh a node's key appears as a
	// peer on at most the other N-1 nodes; more occurrences implies key reuse.
	allKeys := map[string][]string{}
	for _, n := range nodes {
		for _, peer := range n.wg.Peers {
			allKeys[peer.PublicKey] = append(allKeys[peer.PublicKey], n.name)
		}
	}
	dupeKeys := 0
	for _, names := range allKeys {
		if len(names) > len(nodes)-1 {
			dupeKeys++
		}
	}
	if dupeKeys == 0 {
		r = append(r, inspector.Pass("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "",
			fmt.Sprintf("%d unique peer keys", len(allKeys)), inspector.Critical))
	} else {
		// BUG FIX: duplicate keys were previously detected but never reported —
		// only the all-clear Pass branch existed, silently dropping the finding.
		r = append(r, inspector.Fail("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "",
			fmt.Sprintf("%d peer keys appear on more nodes than expected (possible key reuse)", dupeKeys), inspector.Critical))
	}
	return r
}
// countWGNodes reports how many nodes returned any WireGuard data,
// regardless of interface state.
func countWGNodes(data *inspector.ClusterData) int {
	n := 0
	for _, nd := range data.Nodes {
		if nd.WireGuard == nil {
			continue
		}
		n++
	}
	return n
}

View File

@ -0,0 +1,230 @@
package checks
import (
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/inspector"
)
// A downed interface must hard-fail and short-circuit all other per-node checks.
func TestCheckWireGuard_InterfaceDown(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{InterfaceUp: false}
	results := CheckWireGuard(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node}))
	expectStatus(t, results, "wg.interface_up", inspector.StatusFail)
	if c := findCheck(results, "wg.service_active"); c != nil {
		t.Error("should not check service_active when interface down")
	}
}
// TestCheckWireGuard_HealthyNode verifies that a fully healthy node —
// interface up, service active, expected IP/port/MTU, 600-perm config, and
// two peers with fresh handshakes and non-zero traffic — passes every
// per-node WireGuard check.
func TestCheckWireGuard_HealthyNode(t *testing.T) {
	now := time.Now().Unix()
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp:   true,
		ServiceActive: true,
		WgIP:          "10.0.0.1",
		ListenPort:    51820,
		PeerCount:     2,
		MTU:           1420,
		ConfigExists:  true,
		ConfigPerms:   "600",
		Peers: []inspector.WGPeer{
			// Handshakes 30s/60s old — well inside the 5-minute freshness window.
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: now - 30, TransferRx: 1000, TransferTx: 2000},
			{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: now - 60, TransferRx: 500, TransferTx: 800},
		},
	}
	// Single-node for per-node assertions (avoids helper node interference)
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.interface_up", inspector.StatusPass)
	expectStatus(t, results, "wg.service_active", inspector.StatusPass)
	expectStatus(t, results, "wg.correct_ip", inspector.StatusPass)
	expectStatus(t, results, "wg.listen_port", inspector.StatusPass)
	expectStatus(t, results, "wg.mtu", inspector.StatusPass)
	expectStatus(t, results, "wg.config_exists", inspector.StatusPass)
	expectStatus(t, results, "wg.config_perms", inspector.StatusPass)
	expectStatus(t, results, "wg.handshake_all", inspector.StatusPass)
	expectStatus(t, results, "wg.handshake_fresh", inspector.StatusPass)
	expectStatus(t, results, "wg.peer_traffic", inspector.StatusPass)
}
// A WireGuard IP outside the expected range should warn on wg.correct_ip.
func TestCheckWireGuard_WrongIP(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "192.168.1.5"}
	results := CheckWireGuard(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node}))
	expectStatus(t, results, "wg.correct_ip", inspector.StatusWarn)
}
// A non-standard listen port should warn on wg.listen_port.
func TestCheckWireGuard_WrongPort(t *testing.T) {
	node := makeNodeData("1.1.1.1", "node")
	node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", ListenPort: 12345}
	results := CheckWireGuard(makeCluster(map[string]*inspector.NodeData{"1.1.1.1": node}))
	expectStatus(t, results, "wg.listen_port", inspector.StatusWarn)
}
func TestCheckWireGuard_PeerCountMismatch(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 1}
nodes := map[string]*inspector.NodeData{"1.1.1.1": nd}
for _, host := range []string{"2.2.2.2", "3.3.3.3", "4.4.4.4"} {
other := makeNodeData(host, "node")
other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 3}
nodes[host] = other
}
data := makeCluster(nodes)
results := CheckWireGuard(data)
// Node 1.1.1.1 has 1 peer but expects 3 (4 nodes - 1)
c := findCheck(results, "wg.peer_count")
if c == nil {
t.Fatal("expected wg.peer_count check")
}
// At least one node should have a warn
hasWarn := false
for _, r := range results {
if r.ID == "wg.peer_count" && r.Status == inspector.StatusWarn {
hasWarn = true
}
}
if !hasWarn {
t.Error("expected at least one wg.peer_count warn for mismatched peer count")
}
}
func TestCheckWireGuard_ZeroPeers(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 0}
nodes := map[string]*inspector.NodeData{"1.1.1.1": nd}
for _, host := range []string{"2.2.2.2", "3.3.3.3"} {
other := makeNodeData(host, "node")
other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2}
nodes[host] = other
}
data := makeCluster(nodes)
results := CheckWireGuard(data)
// At least one node should fail (zero peers = isolated)
hasFail := false
for _, r := range results {
if r.ID == "wg.peer_count" && r.Status == inspector.StatusFail {
hasFail = true
}
}
if !hasFail {
t.Error("expected wg.peer_count fail for isolated node")
}
}
// TestCheckWireGuard_StaleHandshakes: handshakes older than the 300-second
// freshness threshold (here 600s) should downgrade wg.handshake_fresh to a
// warning while other checks are unaffected.
func TestCheckWireGuard_StaleHandshakes(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   2,
		Peers: []inspector.WGPeer{
			// Both peers last handshaked 10 minutes ago — stale, but with traffic.
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200},
			{PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200},
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.handshake_fresh", inspector.StatusWarn)
}
// TestCheckWireGuard_NeverHandshaked: a peer with LatestHandshake == 0
// (never connected) must fail the critical wg.handshake_all check.
func TestCheckWireGuard_NeverHandshaked(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			// Zero timestamp marks a peer that never completed a handshake.
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: 0},
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.handshake_all", inspector.StatusFail)
}
// TestCheckWireGuard_NoTraffic: a peer with a fresh handshake but zero
// bytes transferred in both directions should warn on wg.peer_traffic.
func TestCheckWireGuard_NoTraffic(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			// Handshake is current, but TransferRx/TransferTx are both zero.
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix(), TransferRx: 0, TransferTx: 0},
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.peer_traffic", inspector.StatusWarn)
}
// TestCheckWireGuard_CatchAllRoute: a peer configured with allowed IPs
// 0.0.0.0/0 (a catch-all route, potential route hijack) must fail the
// critical wg.peer_catch_all check.
func TestCheckWireGuard_CatchAllRoute(t *testing.T) {
	nd := makeNodeData("1.1.1.1", "node")
	nd.WireGuard = &inspector.WireGuardData{
		InterfaceUp: true,
		WgIP:        "10.0.0.1",
		PeerCount:   1,
		Peers: []inspector.WGPeer{
			// Healthy handshake/traffic so only the AllowedIPs check can fail.
			{PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "0.0.0.0/0", LatestHandshake: time.Now().Unix(), TransferRx: 100, TransferTx: 200},
		},
	}
	data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
	results := CheckWireGuard(data)
	expectStatus(t, results, "wg.peer_catch_all", inspector.StatusFail)
}
// Identical PeerCount and MTU on every node should pass both cross-node checks.
func TestCheckWireGuard_CrossNode_PeerCountConsistent(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for _, host := range hosts {
		node := makeNodeData(host, "node")
		node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2, MTU: 1420}
		nodes[host] = node
	}
	results := CheckWireGuard(makeCluster(nodes))
	expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusPass)
	expectStatus(t, results, "wg.mtu_consistent", inspector.StatusPass)
}
// Differing peer counts across nodes should warn on wg.peer_count_consistent.
func TestCheckWireGuard_CrossNode_PeerCountInconsistent(t *testing.T) {
	hosts := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
	counts := []int{2, 2, 1}
	nodes := make(map[string]*inspector.NodeData, len(hosts))
	for i, host := range hosts {
		node := makeNodeData(host, "node")
		node.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: counts[i], MTU: 1420}
		nodes[host] = node
	}
	results := CheckWireGuard(makeCluster(nodes))
	expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusWarn)
}
func TestCheckWireGuard_NilData(t *testing.T) {
nd := makeNodeData("1.1.1.1", "node")
data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd})
results := CheckWireGuard(data)
if len(results) != 0 {
t.Errorf("expected 0 results for nil WireGuard data, got %d", len(results))
}
}

1268
pkg/inspector/collector.go Normal file

File diff suppressed because it is too large Load Diff

118
pkg/inspector/config.go Normal file
View File

@ -0,0 +1,118 @@
package inspector
import (
"bufio"
"fmt"
"os"
"strings"
)
// Node represents a remote node parsed from remote-nodes.conf
// (pipe-delimited: environment|user@host|password|role|ssh_key).
type Node struct {
	Environment string // devnet, testnet
	User        string // SSH user
	Host        string // IP or hostname
	Password    string // SSH password
	Role        string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3
	SSHKey      string // optional path to SSH key
}
// Name returns the node's short display identity in "user@host" form.
func (n Node) Name() string {
	return n.User + "@" + n.Host
}
// IsNameserver reports whether this node runs DNS, i.e. its role carries
// the "nameserver" prefix (nameserver-ns1, nameserver-ns2, ...).
func (n Node) IsNameserver() bool {
	return strings.HasPrefix(n.Role, "nameserver")
}
// LoadNodes parses a remote-nodes.conf file into a slice of Nodes.
//
// Format (one node per line): environment|user@host|password|role|ssh_key
// The ssh_key field is optional. Blank lines and lines starting with '#'
// are skipped. Malformed lines produce an error naming the line number.
func LoadNodes(path string) ([]Node, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open config: %w", err)
	}
	defer f.Close()
	var nodes []Node
	scanner := bufio.NewScanner(f)
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		// Limit of 5 keeps any '|' inside the optional ssh_key field intact.
		parts := strings.SplitN(line, "|", 5)
		if len(parts) < 4 {
			return nil, fmt.Errorf("line %d: expected at least 4 pipe-delimited fields, got %d", lineNum, len(parts))
		}
		env := parts[0]
		userHost := parts[1]
		password := parts[2]
		role := parts[3]
		var sshKey string
		if len(parts) == 5 {
			sshKey = parts[4]
		}
		// Parse user@host; LastIndex tolerates an '@' inside the user part.
		at := strings.LastIndex(userHost, "@")
		if at < 0 {
			return nil, fmt.Errorf("line %d: expected user@host format, got %q", lineNum, userHost)
		}
		user := userHost[:at]
		host := userHost[at+1:]
		// BUG FIX: previously "@host" and "user@" were accepted, producing
		// nodes with an empty user or host that could never be reached.
		if user == "" || host == "" {
			return nil, fmt.Errorf("line %d: user and host must be non-empty in %q", lineNum, userHost)
		}
		nodes = append(nodes, Node{
			Environment: env,
			User:        user,
			Host:        host,
			Password:    password,
			Role:        role,
			SSHKey:      sshKey,
		})
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading config: %w", err)
	}
	return nodes, nil
}
// FilterByEnv returns the subset of nodes whose Environment equals env.
func FilterByEnv(nodes []Node, env string) []Node {
	var out []Node
	for _, node := range nodes {
		if node.Environment != env {
			continue
		}
		out = append(out, node)
	}
	return out
}
// FilterByRole returns the subset of nodes whose Role starts with rolePrefix.
func FilterByRole(nodes []Node, rolePrefix string) []Node {
	var out []Node
	for _, node := range nodes {
		if !strings.HasPrefix(node.Role, rolePrefix) {
			continue
		}
		out = append(out, node)
	}
	return out
}
// RegularNodes returns only the nodes whose role is exactly "node"
// (i.e. excludes nameservers).
func RegularNodes(nodes []Node) []Node {
	var out []Node
	for _, node := range nodes {
		if node.Role != "node" {
			continue
		}
		out = append(out, node)
	}
	return out
}

View File

@ -0,0 +1,179 @@
package inspector
import (
"os"
"path/filepath"
"testing"
)
// TestLoadNodes parses a three-node fixture and verifies every field of the
// first node, plus the role and optional SSH key of the third.
func TestLoadNodes(t *testing.T) {
	content := `# Comment line
devnet|ubuntu@1.2.3.4|pass123|node
devnet|ubuntu@1.2.3.5|pass456|node
devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key
`
	path := writeTempFile(t, content)
	nodes, err := LoadNodes(path)
	if err != nil {
		t.Fatalf("LoadNodes: %v", err)
	}
	if len(nodes) != 3 {
		t.Fatalf("want 3 nodes, got %d", len(nodes))
	}
	// First node: all fields populated, no SSH key (4-field line).
	n := nodes[0]
	if n.Environment != "devnet" {
		t.Errorf("node[0].Environment = %q, want devnet", n.Environment)
	}
	if n.User != "ubuntu" {
		t.Errorf("node[0].User = %q, want ubuntu", n.User)
	}
	if n.Host != "1.2.3.4" {
		t.Errorf("node[0].Host = %q, want 1.2.3.4", n.Host)
	}
	if n.Password != "pass123" {
		t.Errorf("node[0].Password = %q, want pass123", n.Password)
	}
	if n.Role != "node" {
		t.Errorf("node[0].Role = %q, want node", n.Role)
	}
	if n.SSHKey != "" {
		t.Errorf("node[0].SSHKey = %q, want empty", n.SSHKey)
	}
	// Third node: 5-field line with the optional SSH key present.
	n3 := nodes[2]
	if n3.Role != "nameserver-ns1" {
		t.Errorf("node[2].Role = %q, want nameserver-ns1", n3.Role)
	}
	if n3.SSHKey != "/path/to/key" {
		t.Errorf("node[2].SSHKey = %q, want /path/to/key", n3.SSHKey)
	}
}
// Blank lines and '#' comment lines must be skipped, not counted as nodes.
func TestLoadNodes_EmptyLines(t *testing.T) {
	path := writeTempFile(t, "\n# Full line comment\ndevnet|ubuntu@1.2.3.4|pass|node\n# Another comment\ndevnet|ubuntu@1.2.3.5|pass|node\n")
	nodes, err := LoadNodes(path)
	if err != nil {
		t.Fatalf("LoadNodes: %v", err)
	}
	if got := len(nodes); got != 2 {
		t.Fatalf("want 2 nodes (blank/comment lines skipped), got %d", got)
	}
}
func TestLoadNodes_InvalidFormat(t *testing.T) {
tests := []struct {
name string
content string
}{
{"too few fields", "devnet|ubuntu@1.2.3.4|pass\n"},
{"no @ in userhost", "devnet|localhost|pass|node\n"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
path := writeTempFile(t, tt.content)
_, err := LoadNodes(path)
if err == nil {
t.Error("expected error for invalid format")
}
})
}
}
// A nonexistent config path must surface the open error to the caller.
func TestLoadNodes_FileNotFound(t *testing.T) {
	if _, err := LoadNodes("/nonexistent/path/file.conf"); err == nil {
		t.Error("expected error for nonexistent file")
	}
}
// FilterByEnv keeps exactly the nodes from the requested environment.
func TestFilterByEnv(t *testing.T) {
	nodes := []Node{
		{Environment: "devnet", Host: "1.1.1.1"},
		{Environment: "testnet", Host: "2.2.2.2"},
		{Environment: "devnet", Host: "3.3.3.3"},
	}
	filtered := FilterByEnv(nodes, "devnet")
	if len(filtered) != 2 {
		t.Fatalf("want 2 devnet nodes, got %d", len(filtered))
	}
	for _, node := range filtered {
		if node.Environment != "devnet" {
			t.Errorf("got env=%s, want devnet", node.Environment)
		}
	}
}
// Role filtering matches by prefix, so "nameserver" catches every ns variant.
func TestFilterByRole(t *testing.T) {
	nodes := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "nameserver-ns2", Host: "3.3.3.3"},
		{Role: "node", Host: "4.4.4.4"},
	}
	if got := len(FilterByRole(nodes, "nameserver")); got != 2 {
		t.Fatalf("want 2 nameserver nodes, got %d", got)
	}
}
// RegularNodes drops nameservers and keeps plain "node" entries.
func TestRegularNodes(t *testing.T) {
	nodes := []Node{
		{Role: "node", Host: "1.1.1.1"},
		{Role: "nameserver-ns1", Host: "2.2.2.2"},
		{Role: "node", Host: "3.3.3.3"},
	}
	if got := len(RegularNodes(nodes)); got != 2 {
		t.Fatalf("want 2 regular nodes, got %d", got)
	}
}
// Name renders as user@host.
func TestNode_Name(t *testing.T) {
	got := Node{User: "ubuntu", Host: "1.2.3.4"}.Name()
	if got != "ubuntu@1.2.3.4" {
		t.Errorf("Name() = %q, want ubuntu@1.2.3.4", got)
	}
}
// IsNameserver is true for any nameserver-* role and false otherwise.
func TestNode_IsNameserver(t *testing.T) {
	cases := []struct {
		role string
		want bool
	}{
		{"nameserver-ns1", true},
		{"nameserver-ns2", true},
		{"node", false},
		{"", false},
	}
	for _, tc := range cases {
		t.Run(tc.role, func(t *testing.T) {
			if got := (Node{Role: tc.role}).IsNameserver(); got != tc.want {
				t.Errorf("IsNameserver(%q) = %v, want %v", tc.role, got, tc.want)
			}
		})
	}
}
// writeTempFile writes content to a fresh per-test temp file and returns its path.
func writeTempFile(t *testing.T, content string) string {
	t.Helper()
	path := filepath.Join(t.TempDir(), "test-nodes.conf")
	if err := os.WriteFile(path, []byte(content), 0644); err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return path
}

136
pkg/inspector/report.go Normal file
View File

@ -0,0 +1,136 @@
package inspector
import (
"encoding/json"
"fmt"
"io"
"sort"
"strings"
)
// PrintTable writes a human-readable table of check results to w.
//
// Results are sorted failures-first (then warnings, passes, skips), with
// higher severity first within each status and check ID as the final key,
// then grouped by subsystem in order of first appearance. A summary line
// with pass/fail/warn/skip counts and total duration closes the table.
func PrintTable(results *Results, w io.Writer) {
	if len(results.Checks) == 0 {
		fmt.Fprintf(w, "No checks executed.\n")
		return
	}
	// Sort a copy so the caller's slice is untouched.
	// BUG FIX: sort.SliceStable (was sort.Slice) — the comparator ties when
	// the same check ID appears for several nodes, so the unstable sort made
	// row order nondeterministic across runs.
	sorted := make([]CheckResult, len(results.Checks))
	copy(sorted, results.Checks)
	sort.SliceStable(sorted, func(i, j int) bool {
		oi, oj := statusOrder(sorted[i].Status), statusOrder(sorted[j].Status)
		if oi != oj {
			return oi < oj
		}
		// Higher severity first
		if sorted[i].Severity != sorted[j].Severity {
			return sorted[i].Severity > sorted[j].Severity
		}
		return sorted[i].ID < sorted[j].ID
	})
	// Group by subsystem, preserving first-appearance order in the sorted list.
	groups := map[string][]CheckResult{}
	var subsystems []string
	for _, c := range sorted {
		if _, exists := groups[c.Subsystem]; !exists {
			subsystems = append(subsystems, c.Subsystem)
		}
		groups[c.Subsystem] = append(groups[c.Subsystem], c)
	}
	for _, sub := range subsystems {
		checks := groups[sub]
		fmt.Fprintf(w, "\n%s %s\n", severityIcon(Critical), strings.ToUpper(sub))
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))
		for _, c := range checks {
			icon := statusIcon(c.Status)
			sev := fmt.Sprintf("[%s]", c.Severity)
			nodePart := ""
			if c.Node != "" {
				nodePart = fmt.Sprintf(" (%s)", c.Node)
			}
			fmt.Fprintf(w, " %s %-8s %s%s\n", icon, sev, c.Name, nodePart)
			if c.Message != "" {
				fmt.Fprintf(w, " %s\n", c.Message)
			}
		}
	}
	passed, failed, warned, skipped := results.Summary()
	fmt.Fprintf(w, "\n%s\n", strings.Repeat("=", 70))
	fmt.Fprintf(w, "Summary: %d passed, %d failed, %d warnings, %d skipped (%.1fs)\n",
		passed, failed, warned, skipped, results.Duration.Seconds())
}
// PrintJSON writes check results to w as indented JSON with a summary
// object (counts plus total duration) followed by the full check list.
// An encoding/write failure is reported to w as a best-effort JSON error
// object rather than silently dropped.
func PrintJSON(results *Results, w io.Writer) {
	passed, failed, warned, skipped := results.Summary()
	output := struct {
		Summary struct {
			Passed  int     `json:"passed"`
			Failed  int     `json:"failed"`
			Warned  int     `json:"warned"`
			Skipped int     `json:"skipped"`
			Total   int     `json:"total"`
			Seconds float64 `json:"duration_seconds"`
		} `json:"summary"`
		Checks []CheckResult `json:"checks"`
	}{
		Checks: results.Checks,
	}
	output.Summary.Passed = passed
	output.Summary.Failed = failed
	output.Summary.Warned = warned
	output.Summary.Skipped = skipped
	output.Summary.Total = len(results.Checks)
	output.Summary.Seconds = results.Duration.Seconds()
	enc := json.NewEncoder(w)
	enc.SetIndent("", " ")
	// BUG FIX: the Encode error was previously discarded, hiding both
	// marshal failures and write failures on w.
	if err := enc.Encode(output); err != nil {
		fmt.Fprintf(w, "{\"error\": %q}\n", err.Error())
	}
}
// SummaryLine returns a one-line pass/fail/warn/skip summary of the results.
func SummaryLine(results *Results) string {
	passed, failed, warned, skipped := results.Summary()
	return fmt.Sprintf("%d passed, %d failed, %d warnings, %d skipped", passed, failed, warned, skipped)
}
// statusOrder maps a status to its table sort rank: failures first, then
// warnings, passes, and skips; unknown statuses sort last.
func statusOrder(s Status) int {
	ranked := []Status{StatusFail, StatusWarn, StatusPass, StatusSkip}
	for rank, st := range ranked {
		if s == st {
			return rank
		}
	}
	return 4
}
// statusIcon returns the short textual marker printed for a check status.
func statusIcon(s Status) string {
	icons := map[Status]string{
		StatusPass: "OK",
		StatusFail: "FAIL",
		StatusWarn: "WARN",
		StatusSkip: "SKIP",
	}
	if icon, ok := icons[s]; ok {
		return icon
	}
	return "??"
}
// severityIcon returns the subsystem-header marker. The severity argument is
// currently unused but kept for signature symmetry with statusIcon.
func severityIcon(_ Severity) string {
	return "##"
}

View File

@ -0,0 +1,135 @@
package inspector
import (
"bytes"
"encoding/json"
"strings"
"testing"
"time"
)
func TestPrintTable_EmptyResults(t *testing.T) {
r := &Results{}
var buf bytes.Buffer
PrintTable(r, &buf)
if !strings.Contains(buf.String(), "No checks executed") {
t.Errorf("expected 'No checks executed', got %q", buf.String())
}
}
func TestPrintTable_SortsFailuresFirst(t *testing.T) {
r := &Results{
Duration: time.Second,
Checks: []CheckResult{
{ID: "a", Name: "Pass check", Subsystem: "test", Status: StatusPass, Severity: Low},
{ID: "b", Name: "Fail check", Subsystem: "test", Status: StatusFail, Severity: Critical},
{ID: "c", Name: "Warn check", Subsystem: "test", Status: StatusWarn, Severity: High},
},
}
var buf bytes.Buffer
PrintTable(r, &buf)
output := buf.String()
// FAIL should appear before WARN, which should appear before OK
failIdx := strings.Index(output, "FAIL")
warnIdx := strings.Index(output, "WARN")
okIdx := strings.Index(output, "OK")
if failIdx < 0 || warnIdx < 0 || okIdx < 0 {
t.Fatalf("expected FAIL, WARN, and OK in output:\n%s", output)
}
if failIdx > warnIdx {
t.Errorf("FAIL (pos %d) should appear before WARN (pos %d)", failIdx, warnIdx)
}
if warnIdx > okIdx {
t.Errorf("WARN (pos %d) should appear before OK (pos %d)", warnIdx, okIdx)
}
}
func TestPrintTable_IncludesNode(t *testing.T) {
r := &Results{
Duration: time.Second,
Checks: []CheckResult{
{ID: "a", Name: "Check A", Subsystem: "test", Status: StatusPass, Node: "ubuntu@1.2.3.4"},
},
}
var buf bytes.Buffer
PrintTable(r, &buf)
if !strings.Contains(buf.String(), "ubuntu@1.2.3.4") {
t.Error("expected node name in table output")
}
}
func TestPrintTable_IncludesSummary(t *testing.T) {
r := &Results{
Duration: 2 * time.Second,
Checks: []CheckResult{
{ID: "a", Subsystem: "test", Status: StatusPass},
{ID: "b", Subsystem: "test", Status: StatusFail},
},
}
var buf bytes.Buffer
PrintTable(r, &buf)
output := buf.String()
if !strings.Contains(output, "1 passed") {
t.Error("summary should mention passed count")
}
if !strings.Contains(output, "1 failed") {
t.Error("summary should mention failed count")
}
}
// TestPrintJSON_ValidJSON decodes PrintJSON output back through
// encoding/json and verifies the summary counts and the checks array.
// Numeric JSON values unmarshal into interface{} as float64, hence the
// float64 comparisons below.
func TestPrintJSON_ValidJSON(t *testing.T) {
	r := &Results{
		Duration: time.Second,
		Checks: []CheckResult{
			{ID: "a", Name: "A", Subsystem: "test", Status: StatusPass, Severity: Low, Message: "ok"},
			{ID: "b", Name: "B", Subsystem: "test", Status: StatusFail, Severity: High, Message: "bad"},
		},
	}
	var buf bytes.Buffer
	PrintJSON(r, &buf)
	var parsed map[string]interface{}
	if err := json.Unmarshal(buf.Bytes(), &parsed); err != nil {
		t.Fatalf("output is not valid JSON: %v\nraw: %s", err, buf.String())
	}
	summary, ok := parsed["summary"].(map[string]interface{})
	if !ok {
		t.Fatal("missing 'summary' object in JSON")
	}
	if v := summary["passed"]; v != float64(1) {
		t.Errorf("summary.passed = %v, want 1", v)
	}
	if v := summary["failed"]; v != float64(1) {
		t.Errorf("summary.failed = %v, want 1", v)
	}
	if v := summary["total"]; v != float64(2) {
		t.Errorf("summary.total = %v, want 2", v)
	}
	checks, ok := parsed["checks"].([]interface{})
	if !ok {
		t.Fatal("missing 'checks' array in JSON")
	}
	if len(checks) != 2 {
		t.Errorf("want 2 checks, got %d", len(checks))
	}
}
// SummaryLine tallies each status bucket into the expected one-liner.
func TestSummaryLine(t *testing.T) {
	r := &Results{Checks: []CheckResult{
		{Status: StatusPass},
		{Status: StatusPass},
		{Status: StatusFail},
		{Status: StatusWarn},
	}}
	want := "2 passed, 1 failed, 1 warnings, 0 skipped"
	if got := SummaryLine(r); got != want {
		t.Errorf("SummaryLine = %q, want %q", got, want)
	}
}

165
pkg/inspector/ssh.go Normal file
View File

@ -0,0 +1,165 @@
package inspector
import (
"bytes"
"context"
"fmt"
"os/exec"
"strings"
"syscall"
"time"
)
const (
	// sshMaxRetries is the number of additional attempts after the first try
	// when the failure looks like a connection-level problem.
	sshMaxRetries = 3
	// sshRetryDelay is the pause between consecutive SSH attempts.
	sshRetryDelay = 2 * time.Second
)
// SSHResult holds the output of an SSH command execution.
type SSHResult struct {
	Stdout   string        // trimmed standard output of the command
	Stderr   string        // trimmed standard error (ssh/sshpass diagnostics land here too)
	ExitCode int           // process exit status; 0 when the process did not report one
	Duration time.Duration // wall-clock time of the attempt
	Err      error         // error from running the local ssh/sshpass process, if any
	Retries  int           // how many retries were needed
}
// OK reports whether the command completed successfully: no execution
// error and a zero exit code.
func (r SSHResult) OK() bool {
	if r.Err != nil {
		return false
	}
	return r.ExitCode == 0
}
// RunSSH executes a command on a remote node via SSH with retry on
// connection failure. Uses sshpass for password auth, falls back to -i for
// key-based auth. The -n flag is used to prevent SSH from reading stdin.
//
// Remote-command failures are returned immediately without retrying: ssh
// passes the remote exit status through and reserves 255 for its own
// errors, so any non-zero, non-255 exit means the command itself failed.
func RunSSH(ctx context.Context, node Node, command string) SSHResult {
	var result SSHResult
	for attempt := 0; attempt <= sshMaxRetries; attempt++ {
		result = runSSHOnce(ctx, node, command)
		result.Retries = attempt
		// Success — return immediately
		if result.OK() {
			return result
		}
		// BUG FIX: the previous guard (Err == nil && ExitCode != 0) was
		// unreachable — runSSHOnce always sets Err alongside a non-zero exit —
		// so a remote command exiting non-zero while printing e.g.
		// "connection refused" on stderr triggered spurious retries.
		if result.ExitCode != 0 && result.ExitCode != 255 {
			return result
		}
		// Check if it's a connection-level failure worth retrying
		if !isSSHConnectionError(result) {
			return result
		}
		// Don't retry if context is done
		if ctx.Err() != nil {
			return result
		}
		// Wait before retry (except on last attempt)
		if attempt < sshMaxRetries {
			select {
			case <-time.After(sshRetryDelay):
			case <-ctx.Done():
				return result
			}
		}
	}
	return result
}
// runSSHOnce executes a single SSH attempt and captures stdout/stderr,
// exit code, and duration. No retrying happens at this level.
func runSSHOnce(ctx context.Context, node Node, command string) SSHResult {
	start := time.Now()
	target := fmt.Sprintf("%s@%s", node.User, node.Host)
	var args []string
	if node.SSHKey != "" {
		// Key-based auth; BatchMode suppresses interactive prompts.
		args = []string{
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			"-o", "BatchMode=yes",
			"-i", node.SSHKey,
			target,
			command,
		}
	} else {
		// Password auth via sshpass.
		args = []string{
			"sshpass", "-p", node.Password,
			"ssh", "-n",
			"-o", "StrictHostKeyChecking=no",
			"-o", "ConnectTimeout=10",
			target,
			command,
		}
	}
	var stdout, stderr bytes.Buffer
	cmd := exec.CommandContext(ctx, args[0], args[1:]...)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	runErr := cmd.Run()
	exitCode := 0
	if exitErr, ok := runErr.(*exec.ExitError); ok {
		if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
			exitCode = status.ExitStatus()
		}
	}
	return SSHResult{
		Stdout:   strings.TrimSpace(stdout.String()),
		Stderr:   strings.TrimSpace(stderr.String()),
		ExitCode: exitCode,
		Duration: time.Since(start),
		Err:      runErr,
	}
}
// isSSHConnectionError reports whether the failure looks like an SSH
// transport problem (timeout, refused, network unreachable) rather than a
// remote command error. ssh reserves exit 255 for its own failures and is
// always retriable; sshpass exit 5 (bad password) and 6 (host key
// verification) deliberately are not.
func isSSHConnectionError(r SSHResult) bool {
	if r.ExitCode == 255 {
		return true
	}
	stderr := strings.ToLower(r.Stderr)
	for _, pattern := range []string{
		"connection refused",
		"connection timed out",
		"connection reset",
		"no route to host",
		"network is unreachable",
		"could not resolve hostname",
		"ssh_exchange_identification",
		"broken pipe",
		"connection closed by remote host",
	} {
		if strings.Contains(stderr, pattern) {
			return true
		}
	}
	return false
}
// RunSSHMulti executes several commands in a single SSH session, joined
// with " && " so the first failure stops the chain.
func RunSSHMulti(ctx context.Context, node Node, commands []string) SSHResult {
	return RunSSH(ctx, node, strings.Join(commands, " && "))
}