Did a lot of cleanup and bug fixing

This commit is contained in:
anonpenguin23 2026-02-13 12:47:02 +02:00
parent ed82c8ca6b
commit 85a556d0a0
55 changed files with 1121 additions and 119 deletions

View File

@ -84,6 +84,10 @@ func main() {
case "db":
cli.HandleDBCommand(args)
// Cluster management
case "cluster":
cli.HandleClusterCommand(args)
// Cluster inspection
case "inspect":
cli.HandleInspectCommand(args)
@ -167,6 +171,14 @@ func showHelp() {
fmt.Printf(" namespace delete - Delete current namespace and all resources\n")
fmt.Printf(" namespace repair <name> - Repair under-provisioned cluster (add missing nodes)\n\n")
fmt.Printf("🔧 Cluster Management:\n")
fmt.Printf(" cluster status - Show cluster node status\n")
fmt.Printf(" cluster health - Run cluster health checks\n")
fmt.Printf(" cluster rqlite status - Show detailed Raft state\n")
fmt.Printf(" cluster rqlite voters - Show voter list\n")
fmt.Printf(" cluster rqlite backup - Trigger manual backup\n")
fmt.Printf(" cluster watch - Live cluster status monitor\n\n")
fmt.Printf("🔍 Cluster Inspection:\n")
fmt.Printf(" inspect - Inspect cluster health via SSH\n")
fmt.Printf(" inspect --env devnet - Inspect devnet nodes\n")

View File

@ -66,15 +66,25 @@ func main() {
// Create HTTP server for ACME challenge (port 80)
httpServer := &http.Server{
Addr: ":80",
Handler: manager.HTTPHandler(nil), // Redirects all HTTP traffic to HTTPS except ACME challenge
Addr: ":80",
Handler: manager.HTTPHandler(nil), // Redirects all HTTP traffic to HTTPS except ACME challenge
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
// Create HTTPS server (port 443)
httpsServer := &http.Server{
Addr: ":443",
Handler: gw.Routes(),
TLSConfig: manager.TLSConfig(),
Addr: ":443",
Handler: gw.Routes(),
TLSConfig: manager.TLSConfig(),
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
// Start HTTP server for ACME challenge
@ -161,8 +171,13 @@ func main() {
// Standard HTTP server (no HTTPS)
server := &http.Server{
Addr: cfg.ListenAddr,
Handler: gw.Routes(),
Addr: cfg.ListenAddr,
Handler: gw.Routes(),
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
// Try to bind listener explicitly so binding failures are visible immediately.

View File

@ -38,9 +38,11 @@ func HandleCommand(args []string) {
case "start":
lifecycle.HandleStart()
case "stop":
lifecycle.HandleStop()
force := hasFlag(subargs, "--force")
lifecycle.HandleStopWithFlags(force)
case "restart":
lifecycle.HandleRestart()
force := hasFlag(subargs, "--force")
lifecycle.HandleRestartWithFlags(force)
case "logs":
logs.Handle(subargs)
case "uninstall":
@ -54,6 +56,16 @@ func HandleCommand(args []string) {
}
}
// hasFlag reports whether the exact flag string appears in args.
func hasFlag(args []string, flag string) bool {
	for i := range args {
		if args[i] == flag {
			return true
		}
	}
	return false
}
// ShowHelp displays help information for production commands
func ShowHelp() {
fmt.Printf("Production Environment Commands\n\n")
@ -88,7 +100,11 @@ func ShowHelp() {
fmt.Printf(" status - Show status of production services\n")
fmt.Printf(" start - Start all production services (requires root/sudo)\n")
fmt.Printf(" stop - Stop all production services (requires root/sudo)\n")
fmt.Printf(" Options:\n")
fmt.Printf(" --force - Bypass quorum safety check\n")
fmt.Printf(" restart - Restart all production services (requires root/sudo)\n")
fmt.Printf(" Options:\n")
fmt.Printf(" --force - Bypass quorum safety check\n")
fmt.Printf(" logs <service> - View production service logs\n")
fmt.Printf(" Service aliases: node, ipfs, cluster, gateway, olric\n")
fmt.Printf(" Options:\n")

View File

@ -4,58 +4,97 @@ import (
"fmt"
"os"
"os/exec"
"time"
"github.com/DeBrosOfficial/network/pkg/cli/utils"
)
// HandleRestart restarts all production services
func HandleRestart() {
HandleRestartWithFlags(false)
}
// HandleRestartForce restarts all production services, bypassing quorum checks
func HandleRestartForce() {
HandleRestartWithFlags(true)
}
// HandleRestartWithFlags restarts all production services with optional force flag
func HandleRestartWithFlags(force bool) {
if os.Geteuid() != 0 {
fmt.Fprintf(os.Stderr, "❌ Production commands must be run as root (use sudo)\n")
fmt.Fprintf(os.Stderr, "Error: Production commands must be run as root (use sudo)\n")
os.Exit(1)
}
// Pre-flight: check if restarting this node would temporarily break quorum
if !force {
if warning := checkQuorumSafety(); warning != "" {
fmt.Fprintf(os.Stderr, "\nWARNING: %s\n", warning)
fmt.Fprintf(os.Stderr, "Use 'orama prod restart --force' to proceed anyway.\n\n")
os.Exit(1)
}
}
fmt.Printf("Restarting all DeBros production services...\n")
services := utils.GetProductionServices()
if len(services) == 0 {
fmt.Printf(" ⚠️ No DeBros services found\n")
fmt.Printf(" No DeBros services found\n")
return
}
// Stop all active services first
fmt.Printf(" Stopping services...\n")
// Ordered stop: gateway first, then node (RQLite), then supporting services
fmt.Printf("\n Stopping services (ordered)...\n")
shutdownOrder := [][]string{
{"debros-gateway"},
{"debros-node"},
{"debros-olric"},
{"debros-ipfs-cluster", "debros-ipfs"},
{"debros-anyone-relay", "anyone-client"},
{"coredns", "caddy"},
}
for _, group := range shutdownOrder {
for _, svc := range group {
if !containsService(services, svc) {
continue
}
active, _ := utils.IsServiceActive(svc)
if !active {
fmt.Printf(" %s was already stopped\n", svc)
continue
}
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
fmt.Printf(" Warning: Failed to stop %s: %v\n", svc, err)
} else {
fmt.Printf(" Stopped %s\n", svc)
}
}
time.Sleep(1 * time.Second)
}
// Stop any remaining services not in the ordered list
for _, svc := range services {
active, err := utils.IsServiceActive(svc)
if err != nil {
fmt.Printf(" ⚠️ Unable to check %s: %v\n", svc, err)
continue
}
if !active {
fmt.Printf(" %s was already stopped\n", svc)
continue
}
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
fmt.Printf(" ⚠️ Failed to stop %s: %v\n", svc, err)
} else {
fmt.Printf(" ✓ Stopped %s\n", svc)
active, _ := utils.IsServiceActive(svc)
if active {
_ = exec.Command("systemctl", "stop", svc).Run()
}
}
// Check port availability before restarting
ports, err := utils.CollectPortsForServices(services, false)
if err != nil {
fmt.Fprintf(os.Stderr, "❌ %v\n", err)
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}
if err := utils.EnsurePortsAvailable("prod restart", ports); err != nil {
fmt.Fprintf(os.Stderr, "❌ %v\n", err)
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
os.Exit(1)
}
// Start all services in dependency order (namespace: rqlite → olric → gateway)
fmt.Printf(" Starting services...\n")
// Start all services in dependency order
fmt.Printf("\n Starting services...\n")
utils.StartServicesOrdered(services, "start")
fmt.Printf("\n All services restarted\n")
fmt.Printf("\n All services restarted\n")
}

View File

@ -12,11 +12,30 @@ import (
// HandleStop stops all production services
func HandleStop() {
HandleStopWithFlags(false)
}
// HandleStopForce stops all production services, bypassing quorum checks
func HandleStopForce() {
HandleStopWithFlags(true)
}
// HandleStopWithFlags stops all production services with optional force flag
func HandleStopWithFlags(force bool) {
if os.Geteuid() != 0 {
fmt.Fprintf(os.Stderr, "❌ Production commands must be run as root (use sudo)\n")
fmt.Fprintf(os.Stderr, "Error: Production commands must be run as root (use sudo)\n")
os.Exit(1)
}
// Pre-flight: check if stopping this node would break RQLite quorum
if !force {
if warning := checkQuorumSafety(); warning != "" {
fmt.Fprintf(os.Stderr, "\nWARNING: %s\n", warning)
fmt.Fprintf(os.Stderr, "Use 'orama prod stop --force' to proceed anyway.\n\n")
os.Exit(1)
}
}
fmt.Printf("Stopping all DeBros production services...\n")
// First, stop all namespace services
@ -25,28 +44,50 @@ func HandleStop() {
services := utils.GetProductionServices()
if len(services) == 0 {
fmt.Printf(" ⚠️ No DeBros services found\n")
fmt.Printf(" No DeBros services found\n")
return
}
fmt.Printf("\n Stopping main services...\n")
fmt.Printf("\n Stopping main services (ordered)...\n")
// Ordered shutdown: gateway first, then node (RQLite), then supporting services
// This ensures we stop accepting requests before shutting down the database
shutdownOrder := [][]string{
{"debros-gateway"}, // 1. Stop accepting new requests
{"debros-node"}, // 2. Stop node (includes RQLite with leadership transfer)
{"debros-olric"}, // 3. Stop cache
{"debros-ipfs-cluster", "debros-ipfs"}, // 4. Stop storage
{"debros-anyone-relay", "anyone-client"}, // 5. Stop privacy relay
{"coredns", "caddy"}, // 6. Stop DNS/TLS last
}
// First, disable all services to prevent auto-restart
disableArgs := []string{"disable"}
disableArgs = append(disableArgs, services...)
if err := exec.Command("systemctl", disableArgs...).Run(); err != nil {
fmt.Printf(" ⚠️ Warning: Failed to disable some services: %v\n", err)
fmt.Printf(" Warning: Failed to disable some services: %v\n", err)
}
// Stop all services at once using a single systemctl command
// This is more efficient and ensures they all stop together
stopArgs := []string{"stop"}
stopArgs = append(stopArgs, services...)
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
fmt.Printf(" ⚠️ Warning: Some services may have failed to stop: %v\n", err)
// Continue anyway - we'll verify and handle individually below
// Stop services in order with brief pauses between groups
for _, group := range shutdownOrder {
for _, svc := range group {
if !containsService(services, svc) {
continue
}
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
// Not all services may exist on all nodes
} else {
fmt.Printf(" Stopped %s\n", svc)
}
}
time.Sleep(2 * time.Second) // Brief pause between groups for drain
}
// Stop any remaining services not in the ordered list
remainingStopArgs := []string{"stop"}
remainingStopArgs = append(remainingStopArgs, services...)
_ = exec.Command("systemctl", remainingStopArgs...).Run()
// Wait a moment for services to fully stop
time.Sleep(2 * time.Second)
@ -61,7 +102,9 @@ func HandleStop() {
time.Sleep(1 * time.Second)
// Stop again to ensure they're stopped
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
secondStopArgs := []string{"stop"}
secondStopArgs = append(secondStopArgs, services...)
if err := exec.Command("systemctl", secondStopArgs...).Run(); err != nil {
fmt.Printf(" ⚠️ Warning: Second stop attempt had errors: %v\n", err)
}
time.Sleep(1 * time.Second)

View File

@ -179,7 +179,7 @@ func (p *RQLitePlugin) handleNXDomain(ctx context.Context, w dns.ResponseWriter,
Name: p.zones[0],
Rrtype: dns.TypeSOA,
Class: dns.ClassINET,
Ttl: 300,
Ttl: 60,
},
Ns: "ns1." + p.zones[0],
Mbox: "admin." + p.zones[0],
@ -187,7 +187,7 @@ func (p *RQLitePlugin) handleNXDomain(ctx context.Context, w dns.ResponseWriter,
Refresh: 3600,
Retry: 600,
Expire: 86400,
Minttl: 300,
Minttl: 60,
}
msg.Ns = append(msg.Ns, soa)

View File

@ -40,7 +40,7 @@ func parseConfig(c *caddy.Controller) (*RQLitePlugin, error) {
var (
dsn = "http://localhost:5001"
refreshRate = 10 * time.Second
cacheTTL = 300 * time.Second
cacheTTL = 60 * time.Second
cacheSize = 10000
zones []string
)

View File

@ -179,6 +179,16 @@ func (cg *ConfigGenerator) GenerateNodeConfig(peerAddresses []string, vpsIP stri
WGIP: vpsIP,
}
// Set MinClusterSize based on whether this is a genesis or joining node.
// Genesis nodes (no join address) bootstrap alone, so MinClusterSize=1.
// Joining nodes should wait for at least 2 remote peers before writing peers.json
// to prevent accidental solo bootstrap during mass restarts.
if rqliteJoinAddr != "" {
data.MinClusterSize = 3
} else {
data.MinClusterSize = 1
}
// RQLite node-to-node TLS encryption is disabled by default
// This simplifies certificate management - RQLite uses plain TCP for internal Raft
// HTTPS is still used for client-facing gateway traffic via autocert

View File

@ -56,6 +56,10 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[3]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=4G
[Install]
WantedBy=multi-user.target
@ -107,6 +111,10 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[1]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=2G
[Install]
WantedBy=multi-user.target
@ -162,6 +170,9 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[4]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
[Install]
WantedBy=multi-user.target
@ -201,6 +212,10 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[4]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=4G
[Install]
WantedBy=multi-user.target
@ -233,10 +248,21 @@ StandardOutput=append:%[4]s
StandardError=append:%[4]s
SyslogIdentifier=debros-node
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=read-only
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[2]s /etc/systemd/system
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=8G
OOMScoreAdjust=-500
[Install]
WantedBy=multi-user.target
@ -278,6 +304,10 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[2]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=4G
[Install]
WantedBy=multi-user.target
@ -317,6 +347,10 @@ ProtectControlGroups=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=%[3]s
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=1G
[Install]
WantedBy=multi-user.target
@ -346,7 +380,15 @@ NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
RestrictRealtime=yes
RestrictSUIDSGID=yes
ReadWritePaths=/var/lib/anon /var/log/anon /etc/anon
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=2G
[Install]
WantedBy=multi-user.target
@ -372,6 +414,10 @@ SyslogIdentifier=coredns
NoNewPrivileges=true
ProtectSystem=full
ProtectHome=true
LimitNOFILE=65536
TimeoutStopSec=30
KillMode=mixed
MemoryMax=1G
[Install]
WantedBy=multi-user.target
@ -402,6 +448,8 @@ AmbientCapabilities=CAP_NET_BIND_SERVICE
Restart=on-failure
RestartSec=5
SyslogIdentifier=caddy
KillMode=mixed
MemoryMax=2G
[Install]
WantedBy=multi-user.target

View File

@ -22,7 +22,7 @@ database:
{{end}}{{if .NodeNoVerify}}node_no_verify: true
{{end}}{{end}}cluster_sync_interval: "30s"
peer_inactivity_limit: "24h"
min_cluster_size: 1
min_cluster_size: {{if .MinClusterSize}}{{.MinClusterSize}}{{else}}1{{end}}
ipfs:
cluster_api_url: "http://localhost:{{.ClusterAPIPort}}"
api_url: "http://localhost:{{.IPFSAPIPort}}"

View File

@ -33,6 +33,7 @@ type NodeConfigData struct {
HTTPPort int // HTTP port for ACME challenges (usually 80)
HTTPSPort int // HTTPS port (usually 443)
WGIP string // WireGuard IP address (e.g., 10.0.0.1)
MinClusterSize int // Minimum cluster size for RQLite discovery (1 for genesis, 3 for joining)
// Node-to-node TLS encryption for RQLite Raft communication
// Required when using SNI gateway for Raft traffic routing

View File

@ -26,6 +26,7 @@ func (g *Gateway) acmePresentHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req ACMERequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
g.logger.Error("Failed to decode ACME present request", zap.Error(err))
@ -83,6 +84,7 @@ func (g *Gateway) acmeCleanupHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req ACMERequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
g.logger.Error("Failed to decode ACME cleanup request", zap.Error(err))

View File

@ -0,0 +1,121 @@
package gateway
import (
"net/http"
"sync"
"time"
)
// CircuitState represents the current state of a circuit breaker.
type CircuitState int

const (
	CircuitClosed   CircuitState = iota // Normal operation: requests pass through to the backend.
	CircuitOpen                         // Fast-failing: requests are rejected without contacting the backend.
	CircuitHalfOpen                     // Probing: exactly one request is allowed through to test recovery.
)

// Defaults for newly constructed breakers: trip open after 5 consecutive
// recorded failures and stay open for 30s before allowing a probe.
const (
	defaultFailureThreshold = 5
	defaultOpenDuration     = 30 * time.Second
)
// CircuitBreaker implements the circuit breaker pattern per target.
// All fields are guarded by mu; methods are safe for concurrent use.
type CircuitBreaker struct {
	mu               sync.Mutex
	state            CircuitState  // current state (closed / open / half-open)
	failures         int           // consecutive failures since the last recorded success
	failureThreshold int           // failure count at which the circuit trips open
	lastFailure      time.Time     // when the most recent failure was recorded
	openDuration     time.Duration // how long the circuit stays open before a probe is allowed
}
// NewCircuitBreaker returns a breaker in the closed state configured with
// the package-default failure threshold and open duration.
func NewCircuitBreaker() *CircuitBreaker {
	cb := &CircuitBreaker{}
	cb.failureThreshold = defaultFailureThreshold
	cb.openDuration = defaultOpenDuration
	return cb
}
// Allow reports whether a request should be let through.
// Closed: always allow. Open: reject until the cooldown elapses, then move
// to half-open and let a single probe through. Half-open: reject, since the
// one permitted probe is already in flight.
func (cb *CircuitBreaker) Allow() bool {
	cb.mu.Lock()
	defer cb.mu.Unlock()
	if cb.state == CircuitClosed {
		return true
	}
	if cb.state == CircuitOpen {
		if time.Since(cb.lastFailure) < cb.openDuration {
			return false
		}
		// Cooldown elapsed: transition to half-open and admit this
		// caller as the single probe request.
		cb.state = CircuitHalfOpen
		return true
	}
	if cb.state == CircuitHalfOpen {
		return false
	}
	return true
}
// RecordSuccess records a successful response: the consecutive-failure
// counter is cleared and the circuit returns to the closed state.
func (cb *CircuitBreaker) RecordSuccess() {
	cb.mu.Lock()
	cb.state = CircuitClosed
	cb.failures = 0
	cb.mu.Unlock()
}
// RecordFailure records a failed response and stamps the failure time; once
// the consecutive failure count reaches the threshold the circuit trips open.
func (cb *CircuitBreaker) RecordFailure() {
	cb.mu.Lock()
	defer cb.mu.Unlock()
	cb.lastFailure = time.Now()
	cb.failures++
	if cb.failures < cb.failureThreshold {
		return
	}
	cb.state = CircuitOpen
}
// IsResponseFailure reports whether an HTTP response status indicates a
// backend failure that should count toward the circuit breaker threshold.
// Only 502, 503, and 504 qualify; other errors (including 5xx like 500)
// are treated as application responses, not backend outages.
func IsResponseFailure(statusCode int) bool {
	switch statusCode {
	case http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
		return true
	}
	return false
}
// CircuitBreakerRegistry manages per-target circuit breakers.
// The map is guarded by mu; breakers are created lazily by Get on first use.
type CircuitBreakerRegistry struct {
	mu       sync.RWMutex
	breakers map[string]*CircuitBreaker
}
// NewCircuitBreakerRegistry creates an empty registry ready for use.
func NewCircuitBreakerRegistry() *CircuitBreakerRegistry {
	reg := &CircuitBreakerRegistry{}
	reg.breakers = map[string]*CircuitBreaker{}
	return reg
}
// Get returns the circuit breaker for the given target key, creating one on
// first use. A read lock serves the common (existing) case; the write path
// re-checks under the write lock so a breaker created by a concurrent caller
// is never clobbered.
func (r *CircuitBreakerRegistry) Get(target string) *CircuitBreaker {
	r.mu.RLock()
	existing, found := r.breakers[target]
	r.mu.RUnlock()
	if found {
		return existing
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	if concurrent, ok := r.breakers[target]; ok {
		return concurrent
	}
	fresh := NewCircuitBreaker()
	r.breakers[target] = fresh
	return fresh
}

21
pkg/gateway/connlimit.go Normal file
View File

@ -0,0 +1,21 @@
package gateway
import (
"net"
"golang.org/x/net/netutil"
)
const (
	// DefaultMaxConnections is the maximum number of concurrent connections
	// per server; used by LimitedListener when no positive limit is given.
	DefaultMaxConnections = 10000
)
// LimitedListener wraps a net.Listener with a maximum concurrent connection
// limit. When the limit is reached, new Accept calls block until an existing
// connection closes. A non-positive maxConns falls back to
// DefaultMaxConnections.
func LimitedListener(l net.Listener, maxConns int) net.Listener {
	limit := maxConns
	if limit <= 0 {
		limit = DefaultMaxConnections
	}
	return netutil.LimitListener(l, limit)
}

View File

@ -2,11 +2,7 @@ package gateway
import (
"context"
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"database/sql"
"encoding/pem"
"fmt"
"net"
"os"
@ -424,13 +420,11 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
logger.Logger,
)
// Initialize auth service
// For now using ephemeral key, can be loaded from config later
key, _ := rsa.GenerateKey(rand.Reader, 2048)
keyPEM := pem.EncodeToMemory(&pem.Block{
Type: "RSA PRIVATE KEY",
Bytes: x509.MarshalPKCS1PrivateKey(key),
})
// Initialize auth service with persistent signing key
keyPEM, err := loadOrCreateSigningKey(cfg.DataDir, logger)
if err != nil {
return fmt.Errorf("failed to load or create JWT signing key: %w", err)
}
authService, err := auth.NewService(logger, networkClient, string(keyPEM), cfg.ClientNamespace)
if err != nil {
return fmt.Errorf("failed to initialize auth service: %w", err)

View File

@ -117,8 +117,9 @@ type Gateway struct {
// Request log batcher (aggregates writes instead of per-request inserts)
logBatcher *requestLogBatcher
// Rate limiter
rateLimiter *RateLimiter
// Rate limiters
rateLimiter *RateLimiter
namespaceRateLimiter *NamespaceRateLimiter
// WireGuard peer exchange
wireguardHandler *wireguardhandlers.Handler
@ -143,6 +144,12 @@ type Gateway struct {
// Node recovery handler (called when health monitor confirms a node dead or recovered)
nodeRecoverer authhandlers.NodeRecoverer
// Circuit breakers for proxy targets (per-target failure tracking)
circuitBreakers *CircuitBreakerRegistry
// Shared HTTP transport for proxy connections (connection pooling)
proxyTransport *http.Transport
}
// localSubscriber represents a WebSocket subscriber for local message delivery
@ -261,6 +268,12 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
authService: deps.AuthService,
localSubscribers: make(map[string][]*localSubscriber),
presenceMembers: make(map[string][]PresenceMember),
circuitBreakers: NewCircuitBreakerRegistry(),
proxyTransport: &http.Transport{
MaxIdleConns: 200,
MaxIdleConnsPerHost: 20,
IdleConnTimeout: 90 * time.Second,
},
}
// Resolve local WireGuard IP for local namespace gateway preference
@ -337,9 +350,12 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
// Initialize request log batcher (flush every 5 seconds)
gw.logBatcher = newRequestLogBatcher(gw, 5*time.Second, 100)
// Initialize rate limiter (10000 req/min, burst 5000)
// Initialize rate limiters
// Per-IP: 10000 req/min, burst 5000
gw.rateLimiter = NewRateLimiter(10000, 5000)
gw.rateLimiter.StartCleanup(5*time.Minute, 10*time.Minute)
// Per-namespace: 60000 req/hr (1000/min), burst 500
gw.namespaceRateLimiter = NewNamespaceRateLimiter(1000, 500)
// Initialize WireGuard peer exchange handler
if deps.ORMClient != nil {

View File

@ -25,6 +25,7 @@ func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req APIKeyRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")
@ -139,6 +140,7 @@ func (h *Handlers) SimpleAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req SimpleAPIKeyRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -24,6 +24,7 @@ func (h *Handlers) ChallengeHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req ChallengeRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -86,6 +86,7 @@ func (h *Handlers) RefreshHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req RefreshRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")
@ -130,6 +131,7 @@ func (h *Handlers) LogoutHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req LogoutRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -24,6 +24,7 @@ func (h *Handlers) VerifyHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req VerifyRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -73,6 +73,7 @@ func (h *Handlers) RegisterHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
var req RegisterRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -41,6 +41,7 @@ func (h *CacheHandlers) DeleteHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
var req DeleteRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -43,6 +43,7 @@ func (h *CacheHandlers) GetHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
var req GetRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")
@ -135,6 +136,7 @@ func (h *CacheHandlers) MultiGetHandler(w http.ResponseWriter, r *http.Request)
return
}
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
var req MultiGetRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -40,6 +40,7 @@ func (h *CacheHandlers) ScanHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
var req ScanRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -51,6 +51,7 @@ func (h *CacheHandlers) SetHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
var req PutRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -38,6 +38,7 @@ func (h *DomainHandler) HandleAddDomain(w http.ResponseWriter, r *http.Request)
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req struct {
DeploymentName string `json:"deployment_name"`
Domain string `json:"domain"`
@ -156,6 +157,7 @@ func (h *DomainHandler) HandleVerifyDomain(w http.ResponseWriter, r *http.Reques
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req struct {
Domain string `json:"domain"`
}

View File

@ -75,6 +75,7 @@ func (h *ReplicaHandler) HandleSetup(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req replicaSetupRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest)
@ -194,6 +195,7 @@ func (h *ReplicaHandler) HandleUpdate(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req replicaUpdateRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest)
@ -338,6 +340,7 @@ func (h *ReplicaHandler) HandleTeardown(w http.ResponseWriter, r *http.Request)
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req replicaTeardownRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest)

View File

@ -37,6 +37,7 @@ func (h *RollbackHandler) HandleRollback(w http.ResponseWriter, r *http.Request)
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req struct {
Name string `json:"name"`
Version int `json:"version"`

View File

@ -84,6 +84,7 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req JoinRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "invalid request body", http.StatusBadRequest)

View File

@ -86,6 +86,7 @@ func (h *SpawnHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req SpawnRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeSpawnResponse(w, http.StatusBadRequest, SpawnResponse{Error: "invalid request body"})

View File

@ -155,6 +155,7 @@ func (h *StatusHandler) HandleProvision(w http.ResponseWriter, r *http.Request)
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req ProvisionRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")

View File

@ -27,6 +27,7 @@ func (p *PubSubHandlers) PublishHandler(w http.ResponseWriter, r *http.Request)
writeError(w, http.StatusForbidden, "namespace not resolved")
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var body PublishRequest
if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Topic == "" || body.DataB64 == "" {
writeError(w, http.StatusBadRequest, "invalid body: expected {topic,data_base64}")

View File

@ -41,6 +41,7 @@ func (h *BackupHandler) BackupDatabase(w http.ResponseWriter, r *http.Request) {
DatabaseName string `json:"database_name"`
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest)
return

View File

@ -74,6 +74,7 @@ func (h *SQLiteHandler) CreateDatabase(w http.ResponseWriter, r *http.Request) {
DatabaseName string `json:"database_name"`
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeCreateError(w, http.StatusBadRequest, "Invalid request body")
return

View File

@ -44,6 +44,7 @@ func (h *SQLiteHandler) QueryDatabase(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req QueryRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJSONError(w, http.StatusBadRequest, "Invalid request body")

View File

@ -23,6 +23,7 @@ func (h *Handlers) PinHandler(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req StoragePinRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
httputil.WriteError(w, http.StatusBadRequest, fmt.Sprintf("failed to decode request: %v", err))

View File

@ -74,6 +74,7 @@ func (h *Handlers) UploadHandler(w http.ResponseWriter, r *http.Request) {
}
} else {
// Handle JSON request with base64 data
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req StorageUploadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
httputil.WriteError(w, http.StatusBadRequest, fmt.Sprintf("failed to decode request: %v", err))

View File

@ -58,6 +58,7 @@ func (h *Handler) HandleRegisterPeer(w http.ResponseWriter, r *http.Request) {
return
}
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
var req RegisterPeerRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "invalid request body", http.StatusBadRequest)

View File

@ -194,15 +194,21 @@ func (hg *HTTPGateway) Start(ctx context.Context) error {
}
hg.server = &http.Server{
Addr: hg.config.ListenAddr,
Handler: hg.router,
Addr: hg.config.ListenAddr,
Handler: hg.router,
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
// Listen for connections
listener, err := net.Listen("tcp", hg.config.ListenAddr)
// Listen for connections with a max concurrent connection limit
rawListener, err := net.Listen("tcp", hg.config.ListenAddr)
if err != nil {
return fmt.Errorf("failed to listen on %s: %w", hg.config.ListenAddr, err)
}
listener := LimitedListener(rawListener, DefaultMaxConnections)
hg.logger.ComponentInfo(logging.ComponentGeneral, "HTTP Gateway server starting",
zap.String("node_name", hg.config.NodeName),

View File

@ -111,8 +111,13 @@ func (g *HTTPSGateway) Start(ctx context.Context) error {
// Start HTTP server for ACME challenge and redirect
g.httpServer = &http.Server{
Addr: fmt.Sprintf(":%d", httpPort),
Handler: g.httpHandler(),
Addr: fmt.Sprintf(":%d", httpPort),
Handler: g.httpHandler(),
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
go func() {
@ -143,15 +148,21 @@ func (g *HTTPSGateway) Start(ctx context.Context) error {
// Start HTTPS server
g.httpsServer = &http.Server{
Addr: fmt.Sprintf(":%d", httpsPort),
Handler: g.router,
TLSConfig: tlsConfig,
Addr: fmt.Sprintf(":%d", httpsPort),
Handler: g.router,
TLSConfig: tlsConfig,
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
listener, err := tls.Listen("tcp", g.httpsServer.Addr, tlsConfig)
rawListener, err := tls.Listen("tcp", g.httpsServer.Addr, tlsConfig)
if err != nil {
return fmt.Errorf("failed to create TLS listener: %w", err)
}
listener := LimitedListener(rawListener, DefaultMaxConnections)
g.logger.ComponentInfo(logging.ComponentGeneral, "HTTPS Gateway starting",
zap.String("domain", g.httpsConfig.Domain),

View File

@ -179,14 +179,15 @@ func (g *Gateway) proxyWebSocket(w http.ResponseWriter, r *http.Request, targetH
// withMiddleware adds CORS, security headers, rate limiting, and logging middleware
func (g *Gateway) withMiddleware(next http.Handler) http.Handler {
// Order: logging -> security headers -> rate limit -> CORS -> domain routing -> auth -> handler
// Order: logging -> security headers -> rate limit -> CORS -> domain routing -> auth -> namespace rate limit -> handler
return g.loggingMiddleware(
g.securityHeadersMiddleware(
g.rateLimitMiddleware(
g.corsMiddleware(
g.domainRoutingMiddleware(
g.authMiddleware(
g.authorizationMiddleware(next)))))))
g.authorizationMiddleware(
g.namespaceRateLimitMiddleware(next))))))))
}
// securityHeadersMiddleware adds standard security headers to all responses
@ -406,13 +407,16 @@ func extractAPIKey(r *http.Request) string {
}
}
// Fallback to query parameter (for WebSocket support)
if v := strings.TrimSpace(r.URL.Query().Get("api_key")); v != "" {
return v
}
// Also check token query parameter (alternative name)
if v := strings.TrimSpace(r.URL.Query().Get("token")); v != "" {
return v
// Fallback to query parameter ONLY for WebSocket upgrade requests.
// WebSocket clients cannot set custom headers, so query params are the
// only way to authenticate. For regular HTTP requests, require headers.
if isWebSocketUpgrade(r) {
if v := strings.TrimSpace(r.URL.Query().Get("api_key")); v != "" {
return v
}
if v := strings.TrimSpace(r.URL.Query().Get("token")); v != "" {
return v
}
}
return ""
}
@ -658,13 +662,20 @@ func requiresNamespaceOwnership(p string) bool {
return false
}
// corsMiddleware applies permissive CORS headers suitable for early development
// corsMiddleware applies CORS headers. Allows requests from the configured base
// domain and its subdomains. Falls back to permissive "*" only if no base domain
// is configured.
func (g *Gateway) corsMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Access-Control-Allow-Origin", "*")
origin := r.Header.Get("Origin")
allowedOrigin := g.getAllowedOrigin(origin)
w.Header().Set("Access-Control-Allow-Origin", allowedOrigin)
w.Header().Set("Access-Control-Allow-Methods", "GET, PUT, POST, DELETE, OPTIONS")
w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization, X-API-Key")
w.Header().Set("Access-Control-Max-Age", strconv.Itoa(600))
if allowedOrigin != "*" {
w.Header().Set("Vary", "Origin")
}
if r.Method == http.MethodOptions {
w.WriteHeader(http.StatusNoContent)
return
@ -673,6 +684,36 @@ func (g *Gateway) corsMiddleware(next http.Handler) http.Handler {
})
}
// getAllowedOrigin returns the allowed origin for CORS based on the request origin.
// If no base domain is configured, allows all origins (*).
// Otherwise, allows the base domain and any subdomain of it.
func (g *Gateway) getAllowedOrigin(origin string) string {
if g.cfg.BaseDomain == "" {
return "*"
}
if origin == "" {
return "https://" + g.cfg.BaseDomain
}
// Extract hostname from origin (e.g., "https://app.dbrs.space" -> "app.dbrs.space")
host := origin
if idx := strings.Index(host, "://"); idx != -1 {
host = host[idx+3:]
}
// Strip port if present
if idx := strings.Index(host, ":"); idx != -1 {
host = host[:idx]
}
// Allow exact match or subdomain match
if host == g.cfg.BaseDomain || strings.HasSuffix(host, "."+g.cfg.BaseDomain) {
return origin
}
// Also allow common development origins
if host == "localhost" || host == "127.0.0.1" {
return origin
}
return "https://" + g.cfg.BaseDomain
}
// persistRequestLog writes request metadata to the database (best-effort)
func (g *Gateway) persistRequestLog(r *http.Request, srw *statusResponseWriter, dur time.Duration) {
if g.client == nil {
@ -1024,10 +1065,19 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
proxyReq.Header.Set(HeaderInternalAuthNamespace, validatedNamespace)
}
// Execute proxy request
httpClient := &http.Client{Timeout: 30 * time.Second}
// Circuit breaker: check if target is healthy before sending request
cbKey := "ns:" + gatewayIP
cb := g.circuitBreakers.Get(cbKey)
if !cb.Allow() {
http.Error(w, "Namespace gateway unavailable (circuit open)", http.StatusServiceUnavailable)
return
}
// Execute proxy request using shared transport for connection pooling
httpClient := &http.Client{Timeout: 30 * time.Second, Transport: g.proxyTransport}
resp, err := httpClient.Do(proxyReq)
if err != nil {
cb.RecordFailure()
g.logger.ComponentError(logging.ComponentGeneral, "namespace gateway proxy request failed",
zap.String("namespace", namespaceName),
zap.String("target", gatewayIP),
@ -1038,6 +1088,12 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
}
defer resp.Body.Close()
if IsResponseFailure(resp.StatusCode) {
cb.RecordFailure()
} else {
cb.RecordSuccess()
}
// Copy response headers
for key, values := range resp.Header {
for _, value := range values {
@ -1255,8 +1311,8 @@ serveLocal:
}
}
// Execute proxy request
httpClient := &http.Client{Timeout: 30 * time.Second}
// Execute proxy request using shared transport
httpClient := &http.Client{Timeout: 30 * time.Second, Transport: g.proxyTransport}
resp, err := httpClient.Do(proxyReq)
if err != nil {
g.logger.ComponentError(logging.ComponentGeneral, "local proxy request failed",
@ -1354,13 +1410,19 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) // Prevent loops
// Simple HTTP client for internal node-to-node communication
httpClient := &http.Client{
Timeout: 120 * time.Second,
// Circuit breaker: check if target node is healthy
cbKey := "node:" + homeIP
cb := g.circuitBreakers.Get(cbKey)
if !cb.Allow() {
g.logger.Warn("Cross-node proxy skipped (circuit open)", zap.String("target_ip", homeIP))
return false
}
// Internal node-to-node communication using shared transport
httpClient := &http.Client{Timeout: 120 * time.Second, Transport: g.proxyTransport}
resp, err := httpClient.Do(proxyReq)
if err != nil {
cb.RecordFailure()
g.logger.Error("Cross-node proxy request failed",
zap.String("target_ip", homeIP),
zap.String("host", r.Host),
@ -1369,6 +1431,12 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym
}
defer resp.Body.Close()
if IsResponseFailure(resp.StatusCode) {
cb.RecordFailure()
} else {
cb.RecordSuccess()
}
// Copy response headers
for key, values := range resp.Header {
for _, value := range values {
@ -1465,9 +1533,18 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID)
httpClient := &http.Client{Timeout: 5 * time.Second}
// Circuit breaker: skip this replica if it's been failing
cbKey := "node:" + nodeIP
cb := g.circuitBreakers.Get(cbKey)
if !cb.Allow() {
g.logger.Warn("Replica proxy skipped (circuit open)", zap.String("target_ip", nodeIP))
return false
}
httpClient := &http.Client{Timeout: 5 * time.Second, Transport: g.proxyTransport}
resp, err := httpClient.Do(proxyReq)
if err != nil {
cb.RecordFailure()
g.logger.Warn("Replica proxy request failed",
zap.String("target_ip", nodeIP),
zap.Error(err),
@ -1477,13 +1554,15 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep
defer resp.Body.Close()
// If the remote node returned a gateway error, try the next replica
if resp.StatusCode == http.StatusBadGateway || resp.StatusCode == http.StatusServiceUnavailable || resp.StatusCode == http.StatusGatewayTimeout {
if IsResponseFailure(resp.StatusCode) {
cb.RecordFailure()
g.logger.Warn("Replica returned gateway error, trying next",
zap.String("target_ip", nodeIP),
zap.Int("status", resp.StatusCode),
)
return false
}
cb.RecordSuccess()
for key, values := range resp.Header {
for _, value := range values {

View File

@ -82,6 +82,28 @@ func (rl *RateLimiter) StartCleanup(interval, maxAge time.Duration) {
}()
}
// NamespaceRateLimiter provides per-namespace rate limiting using a sync.Map
// for better concurrent performance than a single mutex.
type NamespaceRateLimiter struct {
limiters sync.Map // namespace -> *RateLimiter
rate int // per-minute rate per namespace
burst int
}
// NewNamespaceRateLimiter creates a per-namespace rate limiter.
func NewNamespaceRateLimiter(ratePerMinute, burst int) *NamespaceRateLimiter {
return &NamespaceRateLimiter{rate: ratePerMinute, burst: burst}
}
// Allow checks if a request for the given namespace should be allowed.
func (nrl *NamespaceRateLimiter) Allow(namespace string) bool {
if namespace == "" {
return true
}
val, _ := nrl.limiters.LoadOrStore(namespace, NewRateLimiter(nrl.rate, nrl.burst))
return val.(*RateLimiter).Allow(namespace)
}
// rateLimitMiddleware returns 429 when a client exceeds the rate limit.
// Internal traffic from the WireGuard subnet (10.0.0.0/8) is exempt.
func (g *Gateway) rateLimitMiddleware(next http.Handler) http.Handler {
@ -106,6 +128,27 @@ func (g *Gateway) rateLimitMiddleware(next http.Handler) http.Handler {
})
}
// namespaceRateLimitMiddleware enforces per-namespace rate limits.
// It runs after auth middleware so the namespace is available in context.
func (g *Gateway) namespaceRateLimitMiddleware(next http.Handler) http.Handler {
	if g.namespaceRateLimiter == nil {
		return next
	}
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The auth middleware stores the resolved namespace in the request
		// context; requests without one pass through untouched.
		ns, _ := r.Context().Value(CtxKeyNamespaceOverride).(string)
		if ns != "" && !g.namespaceRateLimiter.Allow(ns) {
			w.Header().Set("Retry-After", "60")
			http.Error(w, "namespace rate limit exceeded", http.StatusTooManyRequests)
			return
		}
		next.ServeHTTP(w, r)
	})
}
// isInternalIP returns true if the IP is in the WireGuard 10.0.0.0/8 subnet
// or is a loopback address.
func isInternalIP(ipStr string) bool {

View File

@ -0,0 +1,63 @@
package gateway
import (
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"encoding/pem"
"fmt"
"os"
"path/filepath"
"github.com/DeBrosOfficial/network/pkg/logging"
"go.uber.org/zap"
)
const jwtKeyFileName = "jwt-signing-key.pem"
// loadOrCreateSigningKey loads the JWT signing key from disk, or generates a new one
// if none exists. This ensures JWTs survive gateway restarts.
func loadOrCreateSigningKey(dataDir string, logger *logging.ColoredLogger) ([]byte, error) {
keyPath := filepath.Join(dataDir, "secrets", jwtKeyFileName)
// Try to load existing key
if keyPEM, err := os.ReadFile(keyPath); err == nil && len(keyPEM) > 0 {
// Verify the key is valid
block, _ := pem.Decode(keyPEM)
if block != nil {
if _, err := x509.ParsePKCS1PrivateKey(block.Bytes); err == nil {
logger.ComponentInfo(logging.ComponentGeneral, "Loaded existing JWT signing key",
zap.String("path", keyPath))
return keyPEM, nil
}
}
logger.ComponentWarn(logging.ComponentGeneral, "Existing JWT signing key is invalid, generating new one",
zap.String("path", keyPath))
}
// Generate new key
key, err := rsa.GenerateKey(rand.Reader, 2048)
if err != nil {
return nil, fmt.Errorf("generate RSA key: %w", err)
}
keyPEM := pem.EncodeToMemory(&pem.Block{
Type: "RSA PRIVATE KEY",
Bytes: x509.MarshalPKCS1PrivateKey(key),
})
// Ensure secrets directory exists
secretsDir := filepath.Dir(keyPath)
if err := os.MkdirAll(secretsDir, 0700); err != nil {
return nil, fmt.Errorf("create secrets directory: %w", err)
}
// Write key with restrictive permissions
if err := os.WriteFile(keyPath, keyPEM, 0600); err != nil {
return nil, fmt.Errorf("write signing key: %w", err)
}
logger.ComponentInfo(logging.ComponentGeneral, "Generated and saved new JWT signing key",
zap.String("path", keyPath))
return keyPEM, nil
}

View File

@ -130,8 +130,13 @@ func (n *Node) startHTTPGateway(ctx context.Context) error {
go func() {
server := &http.Server{
Addr: gwCfg.ListenAddr,
Handler: apiGateway.Routes(),
Addr: gwCfg.ListenAddr,
Handler: apiGateway.Routes(),
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 60 * time.Second,
WriteTimeout: 120 * time.Second,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20, // 1MB
}
n.apiGatewayServer = server

199
pkg/rqlite/backup.go Normal file
View File

@ -0,0 +1,199 @@
package rqlite
import (
"context"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
"time"
"go.uber.org/zap"
)
const (
defaultBackupInterval = 1 * time.Hour
maxBackupRetention = 24
backupDirName = "backups/rqlite"
backupPrefix = "rqlite-backup-"
backupSuffix = ".db"
backupTimestampFormat = "20060102-150405"
)
// startBackupLoop runs a periodic backup of the RQLite database.
// It saves consistent SQLite snapshots to the local backup directory.
// Only the leader node performs backups; followers skip silently.
func (r *RQLiteManager) startBackupLoop(ctx context.Context) {
	period := r.config.BackupInterval
	if period <= 0 {
		period = defaultBackupInterval
	}

	r.logger.Info("RQLite backup loop started",
		zap.Duration("interval", period),
		zap.Int("max_retention", maxBackupRetention))

	// Let the cluster stabilize for one full period before the first snapshot.
	select {
	case <-ctx.Done():
		return
	case <-time.After(period):
	}

	ticker := time.NewTicker(period)
	defer ticker.Stop()

	// First backup fires right after the stabilization wait; subsequent
	// ones run on every tick until the context is cancelled.
	r.performBackup()
	for {
		select {
		case <-ctx.Done():
			r.logger.Info("RQLite backup loop stopped")
			return
		case <-ticker.C:
			r.performBackup()
		}
	}
}
// performBackup executes a single backup cycle: check leadership, take snapshot, prune old backups.
func (r *RQLiteManager) performBackup() {
	// Followers skip; only the leader snapshots, avoiding duplicate work.
	if !r.isLeaderNode() {
		r.logger.Debug("Skipping backup: this node is not the leader")
		return
	}

	dir := r.backupDir()
	if err := os.MkdirAll(dir, 0755); err != nil {
		r.logger.Error("Failed to create backup directory",
			zap.String("dir", dir),
			zap.Error(err))
		return
	}

	// Timestamped name keeps lexical order == chronological order.
	name := backupPrefix + time.Now().UTC().Format(backupTimestampFormat) + backupSuffix
	dest := filepath.Join(dir, name)

	if err := r.downloadBackup(dest); err != nil {
		r.logger.Error("Failed to download RQLite backup",
			zap.String("path", dest),
			zap.Error(err))
		// Don't leave a truncated snapshot behind.
		_ = os.Remove(dest)
		return
	}

	info, err := os.Stat(dest)
	if err != nil {
		r.logger.Error("Failed to stat backup file",
			zap.String("path", dest),
			zap.Error(err))
		return
	}

	r.logger.Info("RQLite backup completed",
		zap.String("path", dest),
		zap.Int64("size_bytes", info.Size()))

	r.pruneOldBackups(dir)
}
// isLeaderNode checks whether this node is currently the Raft leader.
// Returns false when the local status endpoint cannot be queried, since
// leadership cannot be confirmed — the caller then skips the backup cycle.
func (r *RQLiteManager) isLeaderNode() bool {
	status, err := r.getRQLiteStatus()
	if err != nil {
		r.logger.Debug("Cannot determine leader status, skipping backup", zap.Error(err))
		return false
	}
	return status.Store.Raft.State == "Leader"
}
// backupDir returns the path to the backup directory
// (<dataDir>/backups/rqlite, per backupDirName).
func (r *RQLiteManager) backupDir() string {
	return filepath.Join(r.dataDir, backupDirName)
}
// downloadBackup calls the RQLite backup API and writes the SQLite snapshot to disk.
//
// The /db/backup endpoint streams a consistent SQLite snapshot. The destination
// file is closed explicitly and the close error is checked, so a short write
// (e.g. disk full) is reported instead of silently producing a corrupt backup;
// the caller removes the partial file on error.
func (r *RQLiteManager) downloadBackup(destPath string) error {
	url := fmt.Sprintf("http://localhost:%d/db/backup", r.config.RQLitePort)

	client := &http.Client{Timeout: 2 * time.Minute}
	resp, err := client.Get(url)
	if err != nil {
		return fmt.Errorf("request backup endpoint: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("backup endpoint returned %d: %s", resp.StatusCode, string(body))
	}

	outFile, err := os.Create(destPath)
	if err != nil {
		return fmt.Errorf("create backup file: %w", err)
	}

	written, copyErr := io.Copy(outFile, resp.Body)
	// Close unconditionally and check the error: the previous deferred,
	// unchecked Close could hide a flush failure and leave a truncated
	// snapshot that was reported as a successful backup.
	closeErr := outFile.Close()
	if copyErr != nil {
		return fmt.Errorf("write backup data: %w", copyErr)
	}
	if closeErr != nil {
		return fmt.Errorf("close backup file: %w", closeErr)
	}
	if written == 0 {
		return fmt.Errorf("backup file is empty")
	}
	return nil
}
// pruneOldBackups removes the oldest backup files, keeping only the most recent maxBackupRetention.
func (r *RQLiteManager) pruneOldBackups(backupDir string) {
	entries, err := os.ReadDir(backupDir)
	if err != nil {
		r.logger.Error("Failed to list backup directory for pruning",
			zap.String("dir", backupDir),
			zap.Error(err))
		return
	}

	// Only consider files that follow our naming convention; anything else
	// in the directory is left untouched.
	backups := make([]os.DirEntry, 0, len(entries))
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		n := e.Name()
		if strings.HasPrefix(n, backupPrefix) && strings.HasSuffix(n, backupSuffix) {
			backups = append(backups, e)
		}
	}
	if len(backups) <= maxBackupRetention {
		return
	}

	// Timestamps embedded in the filenames make lexical order chronological.
	sort.Slice(backups, func(i, j int) bool { return backups[i].Name() < backups[j].Name() })

	// Everything before the retention window goes, oldest first.
	victims := backups[:len(backups)-maxBackupRetention]
	for _, v := range victims {
		p := filepath.Join(backupDir, v.Name())
		if err := os.Remove(p); err != nil {
			r.logger.Warn("Failed to delete old backup",
				zap.String("path", p),
				zap.Error(err))
		} else {
			r.logger.Debug("Pruned old backup", zap.String("path", p))
		}
	}

	r.logger.Info("Pruned old backups",
		zap.Int("deleted", len(victims)),
		zap.Int("remaining", maxBackupRetention))
}

View File

@ -119,17 +119,15 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
time.Sleep(2 * time.Second)
}
// Even if we only discovered ourselves, write peers.json as a fallback
// This ensures RQLite has consistent state and can potentially recover
// when other nodes come online
// If we only discovered ourselves, do NOT write a single-node peers.json.
// Writing single-node peers.json causes RQLite to bootstrap as a solo cluster,
// making it impossible to rejoin the actual cluster later (-join fails with
// "single-node cluster, joining not supported"). Let RQLite start with its
// existing Raft state or use the -join flag to connect.
if discoveredPeers <= 1 {
r.logger.Warn("Only discovered self during pre-start discovery, writing single-node peers.json as fallback",
r.logger.Warn("Only discovered self during pre-start discovery, skipping peers.json write to prevent solo bootstrap",
zap.Int("discovered_peers", discoveredPeers),
zap.Int("min_cluster_size", r.config.MinClusterSize))
// Still write peers.json with just ourselves - better than nothing
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
r.logger.Warn("Failed to write single-node peers.json fallback", zap.Error(err))
}
return nil
}

View File

@ -145,6 +145,11 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
return fmt.Errorf("failed to start RQLite: %w", err)
}
// Write PID file for reliable orphan detection
pidPath := filepath.Join(logsDir, "rqlited.pid")
_ = os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", r.cmd.Process.Pid)), 0644)
r.logger.Info("RQLite process started", zap.Int("pid", r.cmd.Process.Pid), zap.String("pid_file", pidPath))
logFile.Close()
return nil
}

View File

@ -3,6 +3,7 @@ package rqlite
import (
"context"
"fmt"
"os"
"os/exec"
"syscall"
"time"
@ -71,6 +72,12 @@ func (r *RQLiteManager) Start(ctx context.Context) error {
go r.startVoterReconciliation(ctx)
}
// Start child process watchdog to detect and recover from crashes
go r.startProcessWatchdog(ctx)
// Start periodic RQLite backup loop (leader-only, self-checking)
go r.startBackupLoop(ctx)
if err := r.establishLeadershipOrJoin(ctx, rqliteDataDir); err != nil {
return err
}
@ -92,7 +99,9 @@ func (r *RQLiteManager) GetConnection() *gorqlite.Connection {
return r.connection
}
// Stop stops the RQLite node
// Stop stops the RQLite node gracefully.
// If this node is the Raft leader, it attempts a leadership transfer first
// to minimize cluster disruption.
func (r *RQLiteManager) Stop() error {
if r.connection != nil {
r.connection.Close()
@ -103,16 +112,52 @@ func (r *RQLiteManager) Stop() error {
return nil
}
// Attempt leadership transfer if we are the leader
r.transferLeadershipIfLeader()
_ = r.cmd.Process.Signal(syscall.SIGTERM)
done := make(chan error, 1)
go func() { done <- r.cmd.Wait() }()
// Give RQLite 30s to flush pending writes and shut down gracefully
// (previously 5s which risked Raft log corruption)
select {
case <-done:
case <-time.After(5 * time.Second):
case <-time.After(30 * time.Second):
r.logger.Warn("RQLite did not stop within 30s, sending SIGKILL")
_ = r.cmd.Process.Kill()
}
// Clean up PID file
r.cleanupPIDFile()
return nil
}
// transferLeadershipIfLeader checks if this node is the Raft leader and
// requests a leadership transfer to minimize election disruption.
func (r *RQLiteManager) transferLeadershipIfLeader() {
	// If status is unavailable or we are a follower, there is nothing to do.
	status, err := r.getRQLiteStatus()
	if err != nil || status.Store.Raft.State != "Leader" {
		return
	}
	r.logger.Info("This node is the Raft leader, requesting leadership transfer before shutdown")
	// RQLite has no direct leadership-transfer API; rqlite v8 steps down
	// gracefully on SIGTERM when possible, so we only log for visibility.
	r.logger.Info("Leader will transfer on SIGTERM (rqlite built-in behavior)")
}
// cleanupPIDFile removes the PID file on shutdown (best-effort).
func (r *RQLiteManager) cleanupPIDFile() {
	// The PID file lives in the logs directory, one level above the data dir.
	_ = os.Remove(fmt.Sprintf("%s/../logs/rqlited.pid", r.dataDir))
}

View File

@ -36,16 +36,16 @@ func (r *RQLiteManager) prepareDataDir() (string, error) {
}
func (r *RQLiteManager) hasExistingState(rqliteDataDir string) bool {
entries, err := os.ReadDir(rqliteDataDir)
// Check specifically for raft.db with non-trivial content.
// Previously this checked for ANY file in the data dir, which was too broad —
// auto-discovery creates peers.json and log files before RQLite starts,
// causing false positives that skip the -join flag on restart.
raftDB := filepath.Join(rqliteDataDir, "raft.db")
info, err := os.Stat(raftDB)
if err != nil {
return false
}
for _, e := range entries {
if e.Name() != "." && e.Name() != ".." {
return true
}
}
return false
return info.Size() > 1024
}
func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration {

View File

@ -76,14 +76,24 @@ func TestHasExistingState(t *testing.T) {
t.Errorf("hasExistingState() = true; want false for empty dir")
}
// Test directory with a file
// Test directory with only non-raft files (should still be false)
testFile := filepath.Join(tmpDir, "test.txt")
if err := os.WriteFile(testFile, []byte("data"), 0644); err != nil {
t.Fatalf("failed to create test file: %v", err)
}
if r.hasExistingState(tmpDir) {
t.Errorf("hasExistingState() = true; want false for dir with only non-raft files")
}
// Test directory with raft.db (should be true)
raftDB := filepath.Join(tmpDir, "raft.db")
if err := os.WriteFile(raftDB, make([]byte, 2048), 0644); err != nil {
t.Fatalf("failed to create raft.db: %v", err)
}
if !r.hasExistingState(tmpDir) {
t.Errorf("hasExistingState() = false; want true for non-empty dir")
t.Errorf("hasExistingState() = false; want true for dir with raft.db")
}
}

View File

@ -132,21 +132,67 @@ func (r *RQLiteManager) reconcileVoters() error {
// cluster and immediately re-adding it with the desired voter flag.
// This is necessary because RQLite's /join endpoint ignores voter flag changes
// for nodes that are already cluster members with the same ID and address.
//
// Safety improvements:
// - Pre-check: verify quorum would survive the temporary removal
// - Rollback: if rejoin fails, attempt to re-add with original status
// - Retry: attempt rejoin up to 3 times with backoff
func (r *RQLiteManager) changeNodeVoterStatus(nodeID string, voter bool) error {
// Pre-check: if demoting a voter, verify quorum safety
if !voter {
nodes, err := r.getAllClusterNodes()
if err != nil {
return fmt.Errorf("quorum pre-check: %w", err)
}
voterCount := 0
for _, n := range nodes {
if n.Voter && n.Reachable {
voterCount++
}
}
// After removing this voter, we need (voterCount-1)/2 + 1 for quorum
// which means voterCount-1 > (voterCount-1)/2, i.e., voterCount >= 3
if voterCount <= 2 {
return fmt.Errorf("cannot remove voter: only %d reachable voters, quorum would be lost", voterCount)
}
}
// Step 1: Remove the node from the cluster
if err := r.removeClusterNode(nodeID); err != nil {
return fmt.Errorf("remove node: %w", err)
}
// Brief pause for Raft to commit the configuration change
time.Sleep(2 * time.Second)
// Wait for Raft to commit the configuration change, then rejoin with retries
var lastErr error
for attempt := 0; attempt < 3; attempt++ {
waitTime := time.Duration(2+attempt*2) * time.Second // 2s, 4s, 6s
time.Sleep(waitTime)
// Step 2: Re-add with the correct voter status
if err := r.joinClusterNode(nodeID, nodeID, voter); err != nil {
return fmt.Errorf("rejoin node: %w", err)
if err := r.joinClusterNode(nodeID, nodeID, voter); err != nil {
lastErr = err
r.logger.Warn("Rejoin attempt failed, retrying",
zap.String("node_id", nodeID),
zap.Int("attempt", attempt+1),
zap.Error(err))
continue
}
return nil // Success
}
return nil
// All rejoin attempts failed — try to re-add with the ORIGINAL status as rollback
r.logger.Error("All rejoin attempts failed, attempting rollback",
zap.String("node_id", nodeID),
zap.Bool("desired_voter", voter),
zap.Error(lastErr))
originalVoter := !voter
if err := r.joinClusterNode(nodeID, nodeID, originalVoter); err != nil {
r.logger.Error("Rollback also failed — node may be orphaned from cluster",
zap.String("node_id", nodeID),
zap.Error(err))
}
return fmt.Errorf("rejoin node after 3 attempts: %w", lastErr)
}
// getAllClusterNodes queries /nodes?nonvoters&ver=2 to get all cluster members

99
pkg/rqlite/watchdog.go Normal file
View File

@ -0,0 +1,99 @@
package rqlite
import (
	"context"
	"fmt"
	"net/http"
	"syscall"
	"time"

	"go.uber.org/zap"
)
const (
watchdogInterval = 30 * time.Second
watchdogMaxRestart = 3
)
// startProcessWatchdog monitors the RQLite child process and restarts it if it crashes.
// It checks both process liveness and HTTP responsiveness.
func (r *RQLiteManager) startProcessWatchdog(ctx context.Context) {
	ticker := time.NewTicker(watchdogInterval)
	defer ticker.Stop()

	restarts := 0
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		if r.isProcessAlive() {
			// Liveness is fine; still surface a warning if the HTTP API hangs.
			if !r.isHTTPResponsive() {
				r.logger.Warn("RQLite process is alive but not responding to HTTP")
			}
			continue
		}

		r.logger.Error("RQLite process has died",
			zap.Int("restart_count", restarts),
			zap.Int("max_restarts", watchdogMaxRestart))

		if restarts >= watchdogMaxRestart {
			r.logger.Error("RQLite process watchdog: max restart attempts reached, giving up")
			return
		}

		if err := r.restartProcess(ctx); err != nil {
			r.logger.Error("Failed to restart RQLite process", zap.Error(err))
			restarts++
			continue
		}
		restarts++
		r.logger.Info("RQLite process restarted by watchdog",
			zap.Int("restart_count", restarts))
	}
}
// isProcessAlive checks if the RQLite child process is still running.
//
// Bug fix: the previous implementation called Process.Signal(nil). On Unix,
// os.Process.Signal type-asserts its argument to syscall.Signal, so a nil
// os.Signal always fails with "unsupported signal type" — meaning a perfectly
// healthy process was reported dead and the watchdog tried to restart it on
// every cycle. Sending syscall.Signal(0) is the correct way to probe for
// existence without actually delivering a signal (kill(2) with sig 0).
func (r *RQLiteManager) isProcessAlive() bool {
	if r.cmd == nil || r.cmd.Process == nil {
		return false
	}
	// Signal 0 performs only the existence/permission check.
	return r.cmd.Process.Signal(syscall.Signal(0)) == nil
}
// isHTTPResponsive checks if RQLite is responding to HTTP status requests.
func (r *RQLiteManager) isHTTPResponsive() bool {
	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Get(fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort))
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK
}
// restartProcess attempts to restart the RQLite process: it relaunches the
// child in its data directory, then waits until it is ready and reconnects.
func (r *RQLiteManager) restartProcess(ctx context.Context) error {
	dataDir, err := r.rqliteDataDirPath()
	if err != nil {
		return fmt.Errorf("get data dir: %w", err)
	}
	if err = r.launchProcess(ctx, dataDir); err != nil {
		return fmt.Errorf("launch process: %w", err)
	}
	if err = r.waitForReadyAndConnect(ctx); err != nil {
		return fmt.Errorf("wait for ready: %w", err)
	}
	return nil
}

View File

@ -27,7 +27,16 @@ StandardOutput=journal
StandardError=journal
SyslogIdentifier=debros-gateway-%i
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=read-only
ProtectKernelTunables=yes
ProtectKernelModules=yes
ReadWritePaths=/home/debros/.orama/data/namespaces
LimitNOFILE=65536
MemoryMax=1G
[Install]
WantedBy=multi-user.target

View File

@ -27,7 +27,16 @@ StandardOutput=journal
StandardError=journal
SyslogIdentifier=debros-olric-%i
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=read-only
ProtectKernelTunables=yes
ProtectKernelModules=yes
ReadWritePaths=/home/debros/.orama/data/namespaces
LimitNOFILE=65536
MemoryMax=2G
[Install]
WantedBy=multi-user.target

View File

@ -37,8 +37,17 @@ StandardOutput=journal
StandardError=journal
SyslogIdentifier=debros-rqlite-%i
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=read-only
ProtectKernelTunables=yes
ProtectKernelModules=yes
ReadWritePaths=/home/debros/.orama/data/namespaces
# Resource limits
LimitNOFILE=65536
MemoryMax=2G
[Install]
WantedBy=multi-user.target