mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 08:36:57 +00:00
Updated health check
This commit is contained in:
parent
21e82abb65
commit
359fb5ae04
2
Makefile
2
Makefile
@ -86,7 +86,7 @@ test-e2e-quick:
|
|||||||
|
|
||||||
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
|
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
|
||||||
|
|
||||||
VERSION := 0.101.4
|
VERSION := 0.101.5
|
||||||
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
||||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
||||||
|
|||||||
@ -296,26 +296,40 @@ func (c *Client) Health() (*HealthStatus, error) {
|
|||||||
c.mu.RLock()
|
c.mu.RLock()
|
||||||
defer c.mu.RUnlock()
|
defer c.mu.RUnlock()
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
status := "healthy"
|
status := "healthy"
|
||||||
if !c.connected {
|
checks := make(map[string]string)
|
||||||
|
|
||||||
|
// Connection (real)
|
||||||
|
if c.connected {
|
||||||
|
checks["connection"] = "ok"
|
||||||
|
} else {
|
||||||
|
checks["connection"] = "disconnected"
|
||||||
status = "unhealthy"
|
status = "unhealthy"
|
||||||
}
|
}
|
||||||
|
|
||||||
checks := map[string]string{
|
// LibP2P peers (real)
|
||||||
"connection": "ok",
|
if c.host != nil {
|
||||||
"database": "ok",
|
checks["peers"] = fmt.Sprintf("%d", len(c.host.Network().Peers()))
|
||||||
"pubsub": "ok",
|
} else {
|
||||||
|
checks["peers"] = "0"
|
||||||
}
|
}
|
||||||
|
|
||||||
if !c.connected {
|
// PubSub (real — check if adapter was initialized)
|
||||||
checks["connection"] = "disconnected"
|
if c.pubsub != nil && c.pubsub.adapter != nil {
|
||||||
|
checks["pubsub"] = "ok"
|
||||||
|
} else {
|
||||||
|
checks["pubsub"] = "unavailable"
|
||||||
|
if status == "healthy" {
|
||||||
|
status = "degraded"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return &HealthStatus{
|
return &HealthStatus{
|
||||||
Status: status,
|
Status: status,
|
||||||
Checks: checks,
|
Checks: checks,
|
||||||
LastUpdated: time.Now(),
|
LastUpdated: time.Now(),
|
||||||
ResponseTime: time.Millisecond * 10, // Simulated
|
ResponseTime: time.Since(start),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -174,7 +174,7 @@ func TestHealth(t *testing.T) {
|
|||||||
cfg := &ClientConfig{AppName: "app"}
|
cfg := &ClientConfig{AppName: "app"}
|
||||||
c := &Client{config: cfg}
|
c := &Client{config: cfg}
|
||||||
|
|
||||||
// default disconnected
|
// default disconnected → unhealthy
|
||||||
h, err := c.Health()
|
h, err := c.Health()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("unexpected error: %v", err)
|
t.Fatalf("unexpected error: %v", err)
|
||||||
@ -183,10 +183,17 @@ func TestHealth(t *testing.T) {
|
|||||||
t.Fatalf("expected unhealthy when not connected, got %q", h.Status)
|
t.Fatalf("expected unhealthy when not connected, got %q", h.Status)
|
||||||
}
|
}
|
||||||
|
|
||||||
// mark connected
|
// connected but no pubsub → degraded (pubsub not initialized)
|
||||||
c.connected = true
|
c.connected = true
|
||||||
h2, _ := c.Health()
|
h2, _ := c.Health()
|
||||||
if h2.Status != "healthy" {
|
if h2.Status != "degraded" {
|
||||||
t.Fatalf("expected healthy when connected, got %q", h2.Status)
|
t.Fatalf("expected degraded when connected without pubsub, got %q", h2.Status)
|
||||||
|
}
|
||||||
|
|
||||||
|
// connected with pubsub → healthy
|
||||||
|
c.pubsub = &pubSubBridge{client: c, adapter: &pubsub.ClientAdapter{}}
|
||||||
|
h3, _ := c.Health()
|
||||||
|
if h3.Status != "healthy" {
|
||||||
|
t.Fatalf("expected healthy when fully connected, got %q", h3.Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -47,6 +47,7 @@ type Gateway struct {
|
|||||||
cfg *Config
|
cfg *Config
|
||||||
client client.NetworkClient
|
client client.NetworkClient
|
||||||
nodePeerID string // The node's actual peer ID from its identity file (overrides client's peer ID)
|
nodePeerID string // The node's actual peer ID from its identity file (overrides client's peer ID)
|
||||||
|
localWireGuardIP string // WireGuard IP of this node, used to prefer local namespace gateways
|
||||||
startedAt time.Time
|
startedAt time.Time
|
||||||
|
|
||||||
// rqlite SQL connection and HTTP ORM gateway
|
// rqlite SQL connection and HTTP ORM gateway
|
||||||
@ -62,6 +63,10 @@ type Gateway struct {
|
|||||||
olricMu sync.RWMutex
|
olricMu sync.RWMutex
|
||||||
cacheHandlers *cache.CacheHandlers
|
cacheHandlers *cache.CacheHandlers
|
||||||
|
|
||||||
|
// Health check result cache (5s TTL)
|
||||||
|
healthCacheMu sync.RWMutex
|
||||||
|
healthCache *cachedHealthResult
|
||||||
|
|
||||||
// IPFS storage client
|
// IPFS storage client
|
||||||
ipfsClient ipfs.IPFSClient
|
ipfsClient ipfs.IPFSClient
|
||||||
storageHandlers *storage.Handlers
|
storageHandlers *storage.Handlers
|
||||||
@ -251,6 +256,16 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
|
|||||||
presenceMembers: make(map[string][]PresenceMember),
|
presenceMembers: make(map[string][]PresenceMember),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Resolve local WireGuard IP for local namespace gateway preference
|
||||||
|
if wgIP, err := GetWireGuardIP(); err == nil {
|
||||||
|
gw.localWireGuardIP = wgIP
|
||||||
|
logger.ComponentInfo(logging.ComponentGeneral, "Detected local WireGuard IP for gateway routing",
|
||||||
|
zap.String("wireguard_ip", wgIP))
|
||||||
|
} else {
|
||||||
|
logger.ComponentWarn(logging.ComponentGeneral, "Could not detect WireGuard IP, local gateway preference disabled",
|
||||||
|
zap.Error(err))
|
||||||
|
}
|
||||||
|
|
||||||
// Create separate auth client for global RQLite if GlobalRQLiteDSN is provided
|
// Create separate auth client for global RQLite if GlobalRQLiteDSN is provided
|
||||||
// This allows namespace gateways to validate API keys against the global database
|
// This allows namespace gateways to validate API keys against the global database
|
||||||
if cfg.GlobalRQLiteDSN != "" && cfg.GlobalRQLiteDSN != cfg.RQLiteDSN {
|
if cfg.GlobalRQLiteDSN != "" && cfg.GlobalRQLiteDSN != cfg.RQLiteDSN {
|
||||||
|
|||||||
@ -913,6 +913,21 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
|
|||||||
return targets[i].ip < targets[j].ip
|
return targets[i].ip < targets[j].ip
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Prefer local gateway if this node is part of the namespace cluster.
|
||||||
|
// This avoids a WireGuard network hop and eliminates single-point-of-failure
|
||||||
|
// when a remote gateway node is down.
|
||||||
|
var selected namespaceGatewayTarget
|
||||||
|
if g.localWireGuardIP != "" {
|
||||||
|
for _, t := range targets {
|
||||||
|
if t.ip == g.localWireGuardIP {
|
||||||
|
selected = t
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to consistent hashing for nodes not in the namespace cluster
|
||||||
|
if selected.ip == "" {
|
||||||
affinityKey := namespaceName + "|" + validatedNamespace
|
affinityKey := namespaceName + "|" + validatedNamespace
|
||||||
if apiKey := extractAPIKey(r); apiKey != "" {
|
if apiKey := extractAPIKey(r); apiKey != "" {
|
||||||
affinityKey = namespaceName + "|" + apiKey
|
affinityKey = namespaceName + "|" + apiKey
|
||||||
@ -924,7 +939,8 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
|
|||||||
hasher := fnv.New32a()
|
hasher := fnv.New32a()
|
||||||
_, _ = hasher.Write([]byte(affinityKey))
|
_, _ = hasher.Write([]byte(affinityKey))
|
||||||
targetIdx := int(hasher.Sum32()) % len(targets)
|
targetIdx := int(hasher.Sum32()) % len(targets)
|
||||||
selected := targets[targetIdx]
|
selected = targets[targetIdx]
|
||||||
|
}
|
||||||
gatewayIP := selected.ip
|
gatewayIP := selected.ip
|
||||||
gatewayPort := selected.port
|
gatewayPort := selected.port
|
||||||
targetHost := gatewayIP + ":" + strconv.Itoa(gatewayPort)
|
targetHost := gatewayIP + ":" + strconv.Itoa(gatewayPort)
|
||||||
|
|||||||
@ -336,16 +336,15 @@ func (pd *PeerDiscovery) updateHeartbeat(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// getWireGuardIP extracts the WireGuard IP from the WireGuard interface
|
// GetWireGuardIP detects the local WireGuard IP address using the wg0 network
|
||||||
func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
|
// interface or the WireGuard config file. It does not require a PeerDiscovery
|
||||||
|
// instance and can be called from anywhere in the gateway package.
|
||||||
|
func GetWireGuardIP() (string, error) {
|
||||||
// Method 1: Use 'ip addr show wg0' command (works without root)
|
// Method 1: Use 'ip addr show wg0' command (works without root)
|
||||||
ip, err := pd.getWireGuardIPFromInterface()
|
ip, err := getWireGuardIPFromCommand()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
pd.logger.Info("Found WireGuard IP from network interface",
|
|
||||||
zap.String("ip", ip))
|
|
||||||
return ip, nil
|
return ip, nil
|
||||||
}
|
}
|
||||||
pd.logger.Debug("Failed to get WireGuard IP from interface", zap.Error(err))
|
|
||||||
|
|
||||||
// Method 2: Try to read from WireGuard config file (requires root, may fail)
|
// Method 2: Try to read from WireGuard config file (requires root, may fail)
|
||||||
configPath := "/etc/wireguard/wg0.conf"
|
configPath := "/etc/wireguard/wg0.conf"
|
||||||
@ -363,14 +362,24 @@ func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
|
|||||||
// Remove /24 suffix
|
// Remove /24 suffix
|
||||||
ip := strings.Split(addrWithCIDR, "/")[0]
|
ip := strings.Split(addrWithCIDR, "/")[0]
|
||||||
ip = strings.TrimSpace(ip)
|
ip = strings.TrimSpace(ip)
|
||||||
pd.logger.Info("Found WireGuard IP from config",
|
|
||||||
zap.String("ip", ip))
|
|
||||||
return ip, nil
|
return ip, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pd.logger.Debug("Failed to read WireGuard config", zap.Error(err))
|
|
||||||
|
return "", fmt.Errorf("could not determine WireGuard IP")
|
||||||
|
}
|
||||||
|
|
||||||
|
// getWireGuardIP extracts the WireGuard IP from the WireGuard interface
|
||||||
|
func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
|
||||||
|
// Try the standalone methods first (interface + config file)
|
||||||
|
ip, err := GetWireGuardIP()
|
||||||
|
if err == nil {
|
||||||
|
pd.logger.Info("Found WireGuard IP", zap.String("ip", ip))
|
||||||
|
return ip, nil
|
||||||
|
}
|
||||||
|
pd.logger.Debug("Failed to get WireGuard IP from interface/config", zap.Error(err))
|
||||||
|
|
||||||
// Method 3: Fallback - Try to get from libp2p host addresses
|
// Method 3: Fallback - Try to get from libp2p host addresses
|
||||||
for _, addr := range pd.host.Addrs() {
|
for _, addr := range pd.host.Addrs() {
|
||||||
@ -400,8 +409,8 @@ func (pd *PeerDiscovery) getWireGuardIP() (string, error) {
|
|||||||
return "", fmt.Errorf("could not determine WireGuard IP")
|
return "", fmt.Errorf("could not determine WireGuard IP")
|
||||||
}
|
}
|
||||||
|
|
||||||
// getWireGuardIPFromInterface gets the WireGuard IP using 'ip addr show wg0'
|
// getWireGuardIPFromCommand gets the WireGuard IP using 'ip addr show wg0'
|
||||||
func (pd *PeerDiscovery) getWireGuardIPFromInterface() (string, error) {
|
func getWireGuardIPFromCommand() (string, error) {
|
||||||
cmd := exec.Command("ip", "addr", "show", "wg0")
|
cmd := exec.Command("ip", "addr", "show", "wg0")
|
||||||
output, err := cmd.Output()
|
output, err := cmd.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -1,14 +1,12 @@
|
|||||||
package gateway
|
package gateway
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"context"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/DeBrosOfficial/network/pkg/client"
|
"github.com/DeBrosOfficial/network/pkg/client"
|
||||||
"github.com/DeBrosOfficial/network/pkg/logging"
|
|
||||||
"go.uber.org/zap"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Build info (set via -ldflags at build time; defaults for dev)
|
// Build info (set via -ldflags at build time; defaults for dev)
|
||||||
@ -18,41 +16,159 @@ var (
|
|||||||
BuildTime = ""
|
BuildTime = ""
|
||||||
)
|
)
|
||||||
|
|
||||||
// healthResponse is the JSON structure used by healthHandler
|
// checkResult holds the result of a single subsystem health check.
|
||||||
type healthResponse struct {
|
type checkResult struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"` // "ok", "error", "unavailable"
|
||||||
StartedAt time.Time `json:"started_at"`
|
Latency string `json:"latency,omitempty"` // e.g. "1.2ms"
|
||||||
Uptime string `json:"uptime"`
|
Error string `json:"error,omitempty"` // set when Status == "error"
|
||||||
|
Peers int `json:"peers,omitempty"` // libp2p peer count
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cachedHealthResult caches the aggregate health response for 5 seconds.
|
||||||
|
type cachedHealthResult struct {
|
||||||
|
response any
|
||||||
|
httpStatus int
|
||||||
|
cachedAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
const healthCacheTTL = 5 * time.Second
|
||||||
|
|
||||||
func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) {
|
func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Content-Type", "application/json")
|
// Serve from cache if fresh
|
||||||
server := healthResponse{
|
g.healthCacheMu.RLock()
|
||||||
Status: "ok",
|
cached := g.healthCache
|
||||||
StartedAt: g.startedAt,
|
g.healthCacheMu.RUnlock()
|
||||||
Uptime: time.Since(g.startedAt).String(),
|
if cached != nil && time.Since(cached.cachedAt) < healthCacheTTL {
|
||||||
|
writeJSON(w, cached.httpStatus, cached.response)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var clientHealth *client.HealthStatus
|
// Run all checks in parallel with a shared 5s timeout
|
||||||
if g.client != nil {
|
ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
|
||||||
if h, err := g.client.Health(); err == nil {
|
defer cancel()
|
||||||
clientHealth = h
|
|
||||||
|
type namedResult struct {
|
||||||
|
name string
|
||||||
|
result checkResult
|
||||||
|
}
|
||||||
|
ch := make(chan namedResult, 4)
|
||||||
|
|
||||||
|
// RQLite
|
||||||
|
go func() {
|
||||||
|
nr := namedResult{name: "rqlite"}
|
||||||
|
if g.sqlDB == nil {
|
||||||
|
nr.result = checkResult{Status: "unavailable"}
|
||||||
} else {
|
} else {
|
||||||
g.logger.ComponentWarn(logging.ComponentClient, "failed to fetch client health", zap.Error(err))
|
start := time.Now()
|
||||||
|
if err := g.sqlDB.PingContext(ctx); err != nil {
|
||||||
|
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
|
||||||
|
} else {
|
||||||
|
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- nr
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Olric (thread-safe: can be nil or reconnected in background)
|
||||||
|
go func() {
|
||||||
|
nr := namedResult{name: "olric"}
|
||||||
|
g.olricMu.RLock()
|
||||||
|
oc := g.olricClient
|
||||||
|
g.olricMu.RUnlock()
|
||||||
|
if oc == nil {
|
||||||
|
nr.result = checkResult{Status: "unavailable"}
|
||||||
|
} else {
|
||||||
|
start := time.Now()
|
||||||
|
if err := oc.Health(ctx); err != nil {
|
||||||
|
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
|
||||||
|
} else {
|
||||||
|
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- nr
|
||||||
|
}()
|
||||||
|
|
||||||
|
// IPFS
|
||||||
|
go func() {
|
||||||
|
nr := namedResult{name: "ipfs"}
|
||||||
|
if g.ipfsClient == nil {
|
||||||
|
nr.result = checkResult{Status: "unavailable"}
|
||||||
|
} else {
|
||||||
|
start := time.Now()
|
||||||
|
if err := g.ipfsClient.Health(ctx); err != nil {
|
||||||
|
nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: err.Error()}
|
||||||
|
} else {
|
||||||
|
nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- nr
|
||||||
|
}()
|
||||||
|
|
||||||
|
// LibP2P
|
||||||
|
go func() {
|
||||||
|
nr := namedResult{name: "libp2p"}
|
||||||
|
if g.client == nil {
|
||||||
|
nr.result = checkResult{Status: "unavailable"}
|
||||||
|
} else if h := g.client.Host(); h == nil {
|
||||||
|
nr.result = checkResult{Status: "unavailable"}
|
||||||
|
} else {
|
||||||
|
peers := len(h.Network().Peers())
|
||||||
|
nr.result = checkResult{Status: "ok", Peers: peers}
|
||||||
|
}
|
||||||
|
ch <- nr
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Collect
|
||||||
|
checks := make(map[string]checkResult, 4)
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
nr := <-ch
|
||||||
|
checks[nr.name] = nr.result
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aggregate status.
|
||||||
|
// Critical: rqlite down → "unhealthy"
|
||||||
|
// Non-critical (olric, ipfs, libp2p) error → "degraded"
|
||||||
|
// "unavailable" means the client was never configured — not an error.
|
||||||
|
overallStatus := "healthy"
|
||||||
|
if c := checks["rqlite"]; c.Status == "error" {
|
||||||
|
overallStatus = "unhealthy"
|
||||||
|
}
|
||||||
|
if overallStatus == "healthy" {
|
||||||
|
for name, c := range checks {
|
||||||
|
if name == "rqlite" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if c.Status == "error" {
|
||||||
|
overallStatus = "degraded"
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := struct {
|
httpStatus := http.StatusOK
|
||||||
Status string `json:"status"`
|
if overallStatus != "healthy" {
|
||||||
Server healthResponse `json:"server"`
|
httpStatus = http.StatusServiceUnavailable
|
||||||
Client *client.HealthStatus `json:"client"`
|
|
||||||
}{
|
|
||||||
Status: "ok",
|
|
||||||
Server: server,
|
|
||||||
Client: clientHealth,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = json.NewEncoder(w).Encode(resp)
|
resp := map[string]any{
|
||||||
|
"status": overallStatus,
|
||||||
|
"server": map[string]any{
|
||||||
|
"started_at": g.startedAt,
|
||||||
|
"uptime": time.Since(g.startedAt).String(),
|
||||||
|
},
|
||||||
|
"checks": checks,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache
|
||||||
|
g.healthCacheMu.Lock()
|
||||||
|
g.healthCache = &cachedHealthResult{
|
||||||
|
response: resp,
|
||||||
|
httpStatus: httpStatus,
|
||||||
|
cachedAt: time.Now(),
|
||||||
|
}
|
||||||
|
g.healthCacheMu.Unlock()
|
||||||
|
|
||||||
|
writeJSON(w, httpStatus, resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
// statusHandler aggregates server uptime and network status
|
// statusHandler aggregates server uptime and network status
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user