Updated health check

This commit is contained in:
anonpenguin23 2026-02-10 16:40:01 +02:00
parent 21e82abb65
commit 359fb5ae04
7 changed files with 241 additions and 64 deletions

View File

@ -86,7 +86,7 @@ test-e2e-quick:
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
VERSION := 0.101.4 VERSION := 0.101.5
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'

View File

@ -296,26 +296,40 @@ func (c *Client) Health() (*HealthStatus, error) {
c.mu.RLock() c.mu.RLock()
defer c.mu.RUnlock() defer c.mu.RUnlock()
start := time.Now()
status := "healthy" status := "healthy"
if !c.connected { checks := make(map[string]string)
// Connection (real)
if c.connected {
checks["connection"] = "ok"
} else {
checks["connection"] = "disconnected"
status = "unhealthy" status = "unhealthy"
} }
checks := map[string]string{ // LibP2P peers (real)
"connection": "ok", if c.host != nil {
"database": "ok", checks["peers"] = fmt.Sprintf("%d", len(c.host.Network().Peers()))
"pubsub": "ok", } else {
checks["peers"] = "0"
} }
if !c.connected { // PubSub (real — check if adapter was initialized)
checks["connection"] = "disconnected" if c.pubsub != nil && c.pubsub.adapter != nil {
checks["pubsub"] = "ok"
} else {
checks["pubsub"] = "unavailable"
if status == "healthy" {
status = "degraded"
}
} }
return &HealthStatus{ return &HealthStatus{
Status: status, Status: status,
Checks: checks, Checks: checks,
LastUpdated: time.Now(), LastUpdated: time.Now(),
ResponseTime: time.Millisecond * 10, // Simulated ResponseTime: time.Since(start),
}, nil }, nil
} }

View File

@ -174,7 +174,7 @@ func TestHealth(t *testing.T) {
cfg := &ClientConfig{AppName: "app"} cfg := &ClientConfig{AppName: "app"}
c := &Client{config: cfg} c := &Client{config: cfg}
// default disconnected // default disconnected → unhealthy
h, err := c.Health() h, err := c.Health()
if err != nil { if err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
@ -183,10 +183,17 @@ func TestHealth(t *testing.T) {
t.Fatalf("expected unhealthy when not connected, got %q", h.Status) t.Fatalf("expected unhealthy when not connected, got %q", h.Status)
} }
// mark connected // connected but no pubsub → degraded (pubsub not initialized)
c.connected = true c.connected = true
h2, _ := c.Health() h2, _ := c.Health()
if h2.Status != "healthy" { if h2.Status != "degraded" {
t.Fatalf("expected healthy when connected, got %q", h2.Status) t.Fatalf("expected degraded when connected without pubsub, got %q", h2.Status)
}
// connected with pubsub → healthy
c.pubsub = &pubSubBridge{client: c, adapter: &pubsub.ClientAdapter{}}
h3, _ := c.Health()
if h3.Status != "healthy" {
t.Fatalf("expected healthy when fully connected, got %q", h3.Status)
} }
} }

View File

@ -46,8 +46,9 @@ type Gateway struct {
logger *logging.ColoredLogger logger *logging.ColoredLogger
cfg *Config cfg *Config
client client.NetworkClient client client.NetworkClient
nodePeerID string // The node's actual peer ID from its identity file (overrides client's peer ID) nodePeerID string // The node's actual peer ID from its identity file (overrides client's peer ID)
startedAt time.Time localWireGuardIP string // WireGuard IP of this node, used to prefer local namespace gateways
startedAt time.Time
// rqlite SQL connection and HTTP ORM gateway // rqlite SQL connection and HTTP ORM gateway
sqlDB *sql.DB sqlDB *sql.DB
@ -62,6 +63,10 @@ type Gateway struct {
olricMu sync.RWMutex olricMu sync.RWMutex
cacheHandlers *cache.CacheHandlers cacheHandlers *cache.CacheHandlers
// Health check result cache (5s TTL)
healthCacheMu sync.RWMutex
healthCache *cachedHealthResult
// IPFS storage client // IPFS storage client
ipfsClient ipfs.IPFSClient ipfsClient ipfs.IPFSClient
storageHandlers *storage.Handlers storageHandlers *storage.Handlers
@ -251,6 +256,16 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
presenceMembers: make(map[string][]PresenceMember), presenceMembers: make(map[string][]PresenceMember),
} }
// Resolve local WireGuard IP for local namespace gateway preference
if wgIP, err := GetWireGuardIP(); err == nil {
gw.localWireGuardIP = wgIP
logger.ComponentInfo(logging.ComponentGeneral, "Detected local WireGuard IP for gateway routing",
zap.String("wireguard_ip", wgIP))
} else {
logger.ComponentWarn(logging.ComponentGeneral, "Could not detect WireGuard IP, local gateway preference disabled",
zap.Error(err))
}
// Create separate auth client for global RQLite if GlobalRQLiteDSN is provided // Create separate auth client for global RQLite if GlobalRQLiteDSN is provided
// This allows namespace gateways to validate API keys against the global database // This allows namespace gateways to validate API keys against the global database
if cfg.GlobalRQLiteDSN != "" && cfg.GlobalRQLiteDSN != cfg.RQLiteDSN { if cfg.GlobalRQLiteDSN != "" && cfg.GlobalRQLiteDSN != cfg.RQLiteDSN {

View File

@ -913,18 +913,34 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
return targets[i].ip < targets[j].ip return targets[i].ip < targets[j].ip
}) })
affinityKey := namespaceName + "|" + validatedNamespace // Prefer local gateway if this node is part of the namespace cluster.
if apiKey := extractAPIKey(r); apiKey != "" { // This avoids a WireGuard network hop and eliminates single-point-of-failure
affinityKey = namespaceName + "|" + apiKey // when a remote gateway node is down.
} else if authz := strings.TrimSpace(r.Header.Get("Authorization")); authz != "" { var selected namespaceGatewayTarget
affinityKey = namespaceName + "|" + authz if g.localWireGuardIP != "" {
} else { for _, t := range targets {
affinityKey = namespaceName + "|" + getClientIP(r) if t.ip == g.localWireGuardIP {
selected = t
break
}
}
}
// Fall back to consistent hashing for nodes not in the namespace cluster
if selected.ip == "" {
affinityKey := namespaceName + "|" + validatedNamespace
if apiKey := extractAPIKey(r); apiKey != "" {
affinityKey = namespaceName + "|" + apiKey
} else if authz := strings.TrimSpace(r.Header.Get("Authorization")); authz != "" {
affinityKey = namespaceName + "|" + authz
} else {
affinityKey = namespaceName + "|" + getClientIP(r)
}
hasher := fnv.New32a()
_, _ = hasher.Write([]byte(affinityKey))
targetIdx := int(hasher.Sum32()) % len(targets)
selected = targets[targetIdx]
} }
hasher := fnv.New32a()
_, _ = hasher.Write([]byte(affinityKey))
targetIdx := int(hasher.Sum32()) % len(targets)
selected := targets[targetIdx]
gatewayIP := selected.ip gatewayIP := selected.ip
gatewayPort := selected.port gatewayPort := selected.port
targetHost := gatewayIP + ":" + strconv.Itoa(gatewayPort) targetHost := gatewayIP + ":" + strconv.Itoa(gatewayPort)

View File

@ -336,16 +336,15 @@ func (pd *PeerDiscovery) updateHeartbeat(ctx context.Context) error {
return nil return nil
} }
// getWireGuardIP extracts the WireGuard IP from the WireGuard interface // GetWireGuardIP detects the local WireGuard IP address using the wg0 network
func (pd *PeerDiscovery) getWireGuardIP() (string, error) { // interface or the WireGuard config file. It does not require a PeerDiscovery
// instance and can be called from anywhere in the gateway package.
func GetWireGuardIP() (string, error) {
// Method 1: Use 'ip addr show wg0' command (works without root) // Method 1: Use 'ip addr show wg0' command (works without root)
ip, err := pd.getWireGuardIPFromInterface() ip, err := getWireGuardIPFromCommand()
if err == nil { if err == nil {
pd.logger.Info("Found WireGuard IP from network interface",
zap.String("ip", ip))
return ip, nil return ip, nil
} }
pd.logger.Debug("Failed to get WireGuard IP from interface", zap.Error(err))
// Method 2: Try to read from WireGuard config file (requires root, may fail) // Method 2: Try to read from WireGuard config file (requires root, may fail)
configPath := "/etc/wireguard/wg0.conf" configPath := "/etc/wireguard/wg0.conf"
@ -363,14 +362,24 @@ func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
// Remove /24 suffix // Remove /24 suffix
ip := strings.Split(addrWithCIDR, "/")[0] ip := strings.Split(addrWithCIDR, "/")[0]
ip = strings.TrimSpace(ip) ip = strings.TrimSpace(ip)
pd.logger.Info("Found WireGuard IP from config",
zap.String("ip", ip))
return ip, nil return ip, nil
} }
} }
} }
} }
pd.logger.Debug("Failed to read WireGuard config", zap.Error(err))
return "", fmt.Errorf("could not determine WireGuard IP")
}
// getWireGuardIP extracts the WireGuard IP from the WireGuard interface
func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
// Try the standalone methods first (interface + config file)
ip, err := GetWireGuardIP()
if err == nil {
pd.logger.Info("Found WireGuard IP", zap.String("ip", ip))
return ip, nil
}
pd.logger.Debug("Failed to get WireGuard IP from interface/config", zap.Error(err))
// Method 3: Fallback - Try to get from libp2p host addresses // Method 3: Fallback - Try to get from libp2p host addresses
for _, addr := range pd.host.Addrs() { for _, addr := range pd.host.Addrs() {
@ -400,8 +409,8 @@ func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
return "", fmt.Errorf("could not determine WireGuard IP") return "", fmt.Errorf("could not determine WireGuard IP")
} }
// getWireGuardIPFromInterface gets the WireGuard IP using 'ip addr show wg0' // getWireGuardIPFromCommand gets the WireGuard IP using 'ip addr show wg0'
func (pd *PeerDiscovery) getWireGuardIPFromInterface() (string, error) { func getWireGuardIPFromCommand() (string, error) {
cmd := exec.Command("ip", "addr", "show", "wg0") cmd := exec.Command("ip", "addr", "show", "wg0")
output, err := cmd.Output() output, err := cmd.Output()
if err != nil { if err != nil {

View File

@ -1,14 +1,12 @@
package gateway package gateway
import ( import (
"encoding/json" "context"
"net/http" "net/http"
"strings" "strings"
"time" "time"
"github.com/DeBrosOfficial/network/pkg/client" "github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/logging"
"go.uber.org/zap"
) )
// Build info (set via -ldflags at build time; defaults for dev) // Build info (set via -ldflags at build time; defaults for dev)
@ -18,41 +16,159 @@ var (
BuildTime = "" BuildTime = ""
) )
// healthResponse is the JSON structure used by healthHandler // checkResult holds the result of a single subsystem health check.
type healthResponse struct { type checkResult struct {
Status string `json:"status"` Status string `json:"status"` // "ok", "error", "unavailable"
StartedAt time.Time `json:"started_at"` Latency string `json:"latency,omitempty"` // e.g. "1.2ms"
Uptime string `json:"uptime"` Error string `json:"error,omitempty"` // set when Status == "error"
Peers int `json:"peers,omitempty"` // libp2p peer count
} }
// cachedHealthResult caches the aggregate health response for 5 seconds.
type cachedHealthResult struct {
response any
httpStatus int
cachedAt time.Time
}
const healthCacheTTL = 5 * time.Second
func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) { func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json") // Serve from cache if fresh
server := healthResponse{ g.healthCacheMu.RLock()
Status: "ok", cached := g.healthCache
StartedAt: g.startedAt, g.healthCacheMu.RUnlock()
Uptime: time.Since(g.startedAt).String(), if cached != nil && time.Since(cached.cachedAt) < healthCacheTTL {
writeJSON(w, cached.httpStatus, cached.response)
return
} }
var clientHealth *client.HealthStatus // Run all checks in parallel with a shared 5s timeout
if g.client != nil { ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
if h, err := g.client.Health(); err == nil { defer cancel()
clientHealth = h
type namedResult struct {
name string
result checkResult
}
ch := make(chan namedResult, 4)
// RQLite
go func() {
nr := namedResult{name: "rqlite"}
if g.sqlDB == nil {
nr.result = checkResult{Status: "unavailable"}
} else { } else {
g.logger.ComponentWarn(logging.ComponentClient, "failed to fetch client health", zap.Error(err)) start := time.Now()
if err := g.sqlDB.PingContext(ctx); err != nil {
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
} else {
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
}
}
ch <- nr
}()
// Olric (thread-safe: can be nil or reconnected in background)
go func() {
nr := namedResult{name: "olric"}
g.olricMu.RLock()
oc := g.olricClient
g.olricMu.RUnlock()
if oc == nil {
nr.result = checkResult{Status: "unavailable"}
} else {
start := time.Now()
if err := oc.Health(ctx); err != nil {
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
} else {
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
}
}
ch <- nr
}()
// IPFS
go func() {
nr := namedResult{name: "ipfs"}
if g.ipfsClient == nil {
nr.result = checkResult{Status: "unavailable"}
} else {
start := time.Now()
if err := g.ipfsClient.Health(ctx); err != nil {
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
} else {
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
}
}
ch <- nr
}()
// LibP2P
go func() {
nr := namedResult{name: "libp2p"}
if g.client == nil {
nr.result = checkResult{Status: "unavailable"}
} else if h := g.client.Host(); h == nil {
nr.result = checkResult{Status: "unavailable"}
} else {
peers := len(h.Network().Peers())
nr.result = checkResult{Status: "ok", Peers: peers}
}
ch <- nr
}()
// Collect
checks := make(map[string]checkResult, 4)
for i := 0; i < 4; i++ {
nr := <-ch
checks[nr.name] = nr.result
}
// Aggregate status.
// Critical: rqlite down → "unhealthy"
// Non-critical (olric, ipfs, libp2p) error → "degraded"
// "unavailable" means the client was never configured — not an error.
overallStatus := "healthy"
if c := checks["rqlite"]; c.Status == "error" {
overallStatus = "unhealthy"
}
if overallStatus == "healthy" {
for name, c := range checks {
if name == "rqlite" {
continue
}
if c.Status == "error" {
overallStatus = "degraded"
break
}
} }
} }
resp := struct { httpStatus := http.StatusOK
Status string `json:"status"` if overallStatus != "healthy" {
Server healthResponse `json:"server"` httpStatus = http.StatusServiceUnavailable
Client *client.HealthStatus `json:"client"`
}{
Status: "ok",
Server: server,
Client: clientHealth,
} }
_ = json.NewEncoder(w).Encode(resp) resp := map[string]any{
"status": overallStatus,
"server": map[string]any{
"started_at": g.startedAt,
"uptime": time.Since(g.startedAt).String(),
},
"checks": checks,
}
// Cache
g.healthCacheMu.Lock()
g.healthCache = &cachedHealthResult{
response: resp,
httpStatus: httpStatus,
cachedAt: time.Now(),
}
g.healthCacheMu.Unlock()
writeJSON(w, httpStatus, resp)
} }
// statusHandler aggregates server uptime and network status // statusHandler aggregates server uptime and network status