mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 03:33:01 +00:00
Did a lot of cleanup and bug fixing
This commit is contained in:
parent
ed82c8ca6b
commit
85a556d0a0
@ -84,6 +84,10 @@ func main() {
|
||||
case "db":
|
||||
cli.HandleDBCommand(args)
|
||||
|
||||
// Cluster management
|
||||
case "cluster":
|
||||
cli.HandleClusterCommand(args)
|
||||
|
||||
// Cluster inspection
|
||||
case "inspect":
|
||||
cli.HandleInspectCommand(args)
|
||||
@ -167,6 +171,14 @@ func showHelp() {
|
||||
fmt.Printf(" namespace delete - Delete current namespace and all resources\n")
|
||||
fmt.Printf(" namespace repair <name> - Repair under-provisioned cluster (add missing nodes)\n\n")
|
||||
|
||||
fmt.Printf("🔧 Cluster Management:\n")
|
||||
fmt.Printf(" cluster status - Show cluster node status\n")
|
||||
fmt.Printf(" cluster health - Run cluster health checks\n")
|
||||
fmt.Printf(" cluster rqlite status - Show detailed Raft state\n")
|
||||
fmt.Printf(" cluster rqlite voters - Show voter list\n")
|
||||
fmt.Printf(" cluster rqlite backup - Trigger manual backup\n")
|
||||
fmt.Printf(" cluster watch - Live cluster status monitor\n\n")
|
||||
|
||||
fmt.Printf("🔍 Cluster Inspection:\n")
|
||||
fmt.Printf(" inspect - Inspect cluster health via SSH\n")
|
||||
fmt.Printf(" inspect --env devnet - Inspect devnet nodes\n")
|
||||
|
||||
@ -66,15 +66,25 @@ func main() {
|
||||
|
||||
// Create HTTP server for ACME challenge (port 80)
|
||||
httpServer := &http.Server{
|
||||
Addr: ":80",
|
||||
Handler: manager.HTTPHandler(nil), // Redirects all HTTP traffic to HTTPS except ACME challenge
|
||||
Addr: ":80",
|
||||
Handler: manager.HTTPHandler(nil), // Redirects all HTTP traffic to HTTPS except ACME challenge
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
// Create HTTPS server (port 443)
|
||||
httpsServer := &http.Server{
|
||||
Addr: ":443",
|
||||
Handler: gw.Routes(),
|
||||
TLSConfig: manager.TLSConfig(),
|
||||
Addr: ":443",
|
||||
Handler: gw.Routes(),
|
||||
TLSConfig: manager.TLSConfig(),
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
// Start HTTP server for ACME challenge
|
||||
@ -161,8 +171,13 @@ func main() {
|
||||
|
||||
// Standard HTTP server (no HTTPS)
|
||||
server := &http.Server{
|
||||
Addr: cfg.ListenAddr,
|
||||
Handler: gw.Routes(),
|
||||
Addr: cfg.ListenAddr,
|
||||
Handler: gw.Routes(),
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
// Try to bind listener explicitly so binding failures are visible immediately.
|
||||
|
||||
@ -38,9 +38,11 @@ func HandleCommand(args []string) {
|
||||
case "start":
|
||||
lifecycle.HandleStart()
|
||||
case "stop":
|
||||
lifecycle.HandleStop()
|
||||
force := hasFlag(subargs, "--force")
|
||||
lifecycle.HandleStopWithFlags(force)
|
||||
case "restart":
|
||||
lifecycle.HandleRestart()
|
||||
force := hasFlag(subargs, "--force")
|
||||
lifecycle.HandleRestartWithFlags(force)
|
||||
case "logs":
|
||||
logs.Handle(subargs)
|
||||
case "uninstall":
|
||||
@ -54,6 +56,16 @@ func HandleCommand(args []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// hasFlag checks if a flag is present in the args slice
|
||||
func hasFlag(args []string, flag string) bool {
|
||||
for _, a := range args {
|
||||
if a == flag {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ShowHelp displays help information for production commands
|
||||
func ShowHelp() {
|
||||
fmt.Printf("Production Environment Commands\n\n")
|
||||
@ -88,7 +100,11 @@ func ShowHelp() {
|
||||
fmt.Printf(" status - Show status of production services\n")
|
||||
fmt.Printf(" start - Start all production services (requires root/sudo)\n")
|
||||
fmt.Printf(" stop - Stop all production services (requires root/sudo)\n")
|
||||
fmt.Printf(" Options:\n")
|
||||
fmt.Printf(" --force - Bypass quorum safety check\n")
|
||||
fmt.Printf(" restart - Restart all production services (requires root/sudo)\n")
|
||||
fmt.Printf(" Options:\n")
|
||||
fmt.Printf(" --force - Bypass quorum safety check\n")
|
||||
fmt.Printf(" logs <service> - View production service logs\n")
|
||||
fmt.Printf(" Service aliases: node, ipfs, cluster, gateway, olric\n")
|
||||
fmt.Printf(" Options:\n")
|
||||
|
||||
@ -4,58 +4,97 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/cli/utils"
|
||||
)
|
||||
|
||||
// HandleRestart restarts all production services
|
||||
func HandleRestart() {
|
||||
HandleRestartWithFlags(false)
|
||||
}
|
||||
|
||||
// HandleRestartForce restarts all production services, bypassing quorum checks
|
||||
func HandleRestartForce() {
|
||||
HandleRestartWithFlags(true)
|
||||
}
|
||||
|
||||
// HandleRestartWithFlags restarts all production services with optional force flag
|
||||
func HandleRestartWithFlags(force bool) {
|
||||
if os.Geteuid() != 0 {
|
||||
fmt.Fprintf(os.Stderr, "❌ Production commands must be run as root (use sudo)\n")
|
||||
fmt.Fprintf(os.Stderr, "Error: Production commands must be run as root (use sudo)\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Pre-flight: check if restarting this node would temporarily break quorum
|
||||
if !force {
|
||||
if warning := checkQuorumSafety(); warning != "" {
|
||||
fmt.Fprintf(os.Stderr, "\nWARNING: %s\n", warning)
|
||||
fmt.Fprintf(os.Stderr, "Use 'orama prod restart --force' to proceed anyway.\n\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Restarting all DeBros production services...\n")
|
||||
|
||||
services := utils.GetProductionServices()
|
||||
if len(services) == 0 {
|
||||
fmt.Printf(" ⚠️ No DeBros services found\n")
|
||||
fmt.Printf(" No DeBros services found\n")
|
||||
return
|
||||
}
|
||||
|
||||
// Stop all active services first
|
||||
fmt.Printf(" Stopping services...\n")
|
||||
// Ordered stop: gateway first, then node (RQLite), then supporting services
|
||||
fmt.Printf("\n Stopping services (ordered)...\n")
|
||||
shutdownOrder := [][]string{
|
||||
{"debros-gateway"},
|
||||
{"debros-node"},
|
||||
{"debros-olric"},
|
||||
{"debros-ipfs-cluster", "debros-ipfs"},
|
||||
{"debros-anyone-relay", "anyone-client"},
|
||||
{"coredns", "caddy"},
|
||||
}
|
||||
|
||||
for _, group := range shutdownOrder {
|
||||
for _, svc := range group {
|
||||
if !containsService(services, svc) {
|
||||
continue
|
||||
}
|
||||
active, _ := utils.IsServiceActive(svc)
|
||||
if !active {
|
||||
fmt.Printf(" %s was already stopped\n", svc)
|
||||
continue
|
||||
}
|
||||
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
|
||||
fmt.Printf(" Warning: Failed to stop %s: %v\n", svc, err)
|
||||
} else {
|
||||
fmt.Printf(" Stopped %s\n", svc)
|
||||
}
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
// Stop any remaining services not in the ordered list
|
||||
for _, svc := range services {
|
||||
active, err := utils.IsServiceActive(svc)
|
||||
if err != nil {
|
||||
fmt.Printf(" ⚠️ Unable to check %s: %v\n", svc, err)
|
||||
continue
|
||||
}
|
||||
if !active {
|
||||
fmt.Printf(" ℹ️ %s was already stopped\n", svc)
|
||||
continue
|
||||
}
|
||||
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Failed to stop %s: %v\n", svc, err)
|
||||
} else {
|
||||
fmt.Printf(" ✓ Stopped %s\n", svc)
|
||||
active, _ := utils.IsServiceActive(svc)
|
||||
if active {
|
||||
_ = exec.Command("systemctl", "stop", svc).Run()
|
||||
}
|
||||
}
|
||||
|
||||
// Check port availability before restarting
|
||||
ports, err := utils.CollectPortsForServices(services, false)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "❌ %v\n", err)
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
if err := utils.EnsurePortsAvailable("prod restart", ports); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "❌ %v\n", err)
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Start all services in dependency order (namespace: rqlite → olric → gateway)
|
||||
fmt.Printf(" Starting services...\n")
|
||||
// Start all services in dependency order
|
||||
fmt.Printf("\n Starting services...\n")
|
||||
utils.StartServicesOrdered(services, "start")
|
||||
|
||||
fmt.Printf("\n✅ All services restarted\n")
|
||||
fmt.Printf("\n All services restarted\n")
|
||||
}
|
||||
|
||||
@ -12,11 +12,30 @@ import (
|
||||
|
||||
// HandleStop stops all production services
|
||||
func HandleStop() {
|
||||
HandleStopWithFlags(false)
|
||||
}
|
||||
|
||||
// HandleStopForce stops all production services, bypassing quorum checks
|
||||
func HandleStopForce() {
|
||||
HandleStopWithFlags(true)
|
||||
}
|
||||
|
||||
// HandleStopWithFlags stops all production services with optional force flag
|
||||
func HandleStopWithFlags(force bool) {
|
||||
if os.Geteuid() != 0 {
|
||||
fmt.Fprintf(os.Stderr, "❌ Production commands must be run as root (use sudo)\n")
|
||||
fmt.Fprintf(os.Stderr, "Error: Production commands must be run as root (use sudo)\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Pre-flight: check if stopping this node would break RQLite quorum
|
||||
if !force {
|
||||
if warning := checkQuorumSafety(); warning != "" {
|
||||
fmt.Fprintf(os.Stderr, "\nWARNING: %s\n", warning)
|
||||
fmt.Fprintf(os.Stderr, "Use 'orama prod stop --force' to proceed anyway.\n\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Stopping all DeBros production services...\n")
|
||||
|
||||
// First, stop all namespace services
|
||||
@ -25,28 +44,50 @@ func HandleStop() {
|
||||
|
||||
services := utils.GetProductionServices()
|
||||
if len(services) == 0 {
|
||||
fmt.Printf(" ⚠️ No DeBros services found\n")
|
||||
fmt.Printf(" No DeBros services found\n")
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("\n Stopping main services...\n")
|
||||
fmt.Printf("\n Stopping main services (ordered)...\n")
|
||||
|
||||
// Ordered shutdown: gateway first, then node (RQLite), then supporting services
|
||||
// This ensures we stop accepting requests before shutting down the database
|
||||
shutdownOrder := [][]string{
|
||||
{"debros-gateway"}, // 1. Stop accepting new requests
|
||||
{"debros-node"}, // 2. Stop node (includes RQLite with leadership transfer)
|
||||
{"debros-olric"}, // 3. Stop cache
|
||||
{"debros-ipfs-cluster", "debros-ipfs"}, // 4. Stop storage
|
||||
{"debros-anyone-relay", "anyone-client"}, // 5. Stop privacy relay
|
||||
{"coredns", "caddy"}, // 6. Stop DNS/TLS last
|
||||
}
|
||||
|
||||
// First, disable all services to prevent auto-restart
|
||||
disableArgs := []string{"disable"}
|
||||
disableArgs = append(disableArgs, services...)
|
||||
if err := exec.Command("systemctl", disableArgs...).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Warning: Failed to disable some services: %v\n", err)
|
||||
fmt.Printf(" Warning: Failed to disable some services: %v\n", err)
|
||||
}
|
||||
|
||||
// Stop all services at once using a single systemctl command
|
||||
// This is more efficient and ensures they all stop together
|
||||
stopArgs := []string{"stop"}
|
||||
stopArgs = append(stopArgs, services...)
|
||||
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Warning: Some services may have failed to stop: %v\n", err)
|
||||
// Continue anyway - we'll verify and handle individually below
|
||||
// Stop services in order with brief pauses between groups
|
||||
for _, group := range shutdownOrder {
|
||||
for _, svc := range group {
|
||||
if !containsService(services, svc) {
|
||||
continue
|
||||
}
|
||||
if err := exec.Command("systemctl", "stop", svc).Run(); err != nil {
|
||||
// Not all services may exist on all nodes
|
||||
} else {
|
||||
fmt.Printf(" Stopped %s\n", svc)
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second) // Brief pause between groups for drain
|
||||
}
|
||||
|
||||
// Stop any remaining services not in the ordered list
|
||||
remainingStopArgs := []string{"stop"}
|
||||
remainingStopArgs = append(remainingStopArgs, services...)
|
||||
_ = exec.Command("systemctl", remainingStopArgs...).Run()
|
||||
|
||||
// Wait a moment for services to fully stop
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
@ -61,7 +102,9 @@ func HandleStop() {
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
// Stop again to ensure they're stopped
|
||||
if err := exec.Command("systemctl", stopArgs...).Run(); err != nil {
|
||||
secondStopArgs := []string{"stop"}
|
||||
secondStopArgs = append(secondStopArgs, services...)
|
||||
if err := exec.Command("systemctl", secondStopArgs...).Run(); err != nil {
|
||||
fmt.Printf(" ⚠️ Warning: Second stop attempt had errors: %v\n", err)
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
@ -179,7 +179,7 @@ func (p *RQLitePlugin) handleNXDomain(ctx context.Context, w dns.ResponseWriter,
|
||||
Name: p.zones[0],
|
||||
Rrtype: dns.TypeSOA,
|
||||
Class: dns.ClassINET,
|
||||
Ttl: 300,
|
||||
Ttl: 60,
|
||||
},
|
||||
Ns: "ns1." + p.zones[0],
|
||||
Mbox: "admin." + p.zones[0],
|
||||
@ -187,7 +187,7 @@ func (p *RQLitePlugin) handleNXDomain(ctx context.Context, w dns.ResponseWriter,
|
||||
Refresh: 3600,
|
||||
Retry: 600,
|
||||
Expire: 86400,
|
||||
Minttl: 300,
|
||||
Minttl: 60,
|
||||
}
|
||||
msg.Ns = append(msg.Ns, soa)
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@ func parseConfig(c *caddy.Controller) (*RQLitePlugin, error) {
|
||||
var (
|
||||
dsn = "http://localhost:5001"
|
||||
refreshRate = 10 * time.Second
|
||||
cacheTTL = 300 * time.Second
|
||||
cacheTTL = 60 * time.Second
|
||||
cacheSize = 10000
|
||||
zones []string
|
||||
)
|
||||
|
||||
@ -179,6 +179,16 @@ func (cg *ConfigGenerator) GenerateNodeConfig(peerAddresses []string, vpsIP stri
|
||||
WGIP: vpsIP,
|
||||
}
|
||||
|
||||
// Set MinClusterSize based on whether this is a genesis or joining node.
|
||||
// Genesis nodes (no join address) bootstrap alone, so MinClusterSize=1.
|
||||
// Joining nodes should wait for at least 2 remote peers before writing peers.json
|
||||
// to prevent accidental solo bootstrap during mass restarts.
|
||||
if rqliteJoinAddr != "" {
|
||||
data.MinClusterSize = 3
|
||||
} else {
|
||||
data.MinClusterSize = 1
|
||||
}
|
||||
|
||||
// RQLite node-to-node TLS encryption is disabled by default
|
||||
// This simplifies certificate management - RQLite uses plain TCP for internal Raft
|
||||
// HTTPS is still used for client-facing gateway traffic via autocert
|
||||
|
||||
@ -56,6 +56,10 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[3]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=4G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -107,6 +111,10 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[1]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=2G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -162,6 +170,9 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[4]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -201,6 +212,10 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[4]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=4G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -233,10 +248,21 @@ StandardOutput=append:%[4]s
|
||||
StandardError=append:%[4]s
|
||||
SyslogIdentifier=debros-node
|
||||
|
||||
NoNewPrivileges=yes
|
||||
PrivateTmp=yes
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[2]s /etc/systemd/system
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=8G
|
||||
OOMScoreAdjust=-500
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -278,6 +304,10 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[2]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=4G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -317,6 +347,10 @@ ProtectControlGroups=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=%[3]s
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=1G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -346,7 +380,15 @@ NoNewPrivileges=yes
|
||||
ProtectSystem=full
|
||||
ProtectHome=read-only
|
||||
PrivateTmp=yes
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictSUIDSGID=yes
|
||||
ReadWritePaths=/var/lib/anon /var/log/anon /etc/anon
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=2G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -372,6 +414,10 @@ SyslogIdentifier=coredns
|
||||
NoNewPrivileges=true
|
||||
ProtectSystem=full
|
||||
ProtectHome=true
|
||||
LimitNOFILE=65536
|
||||
TimeoutStopSec=30
|
||||
KillMode=mixed
|
||||
MemoryMax=1G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -402,6 +448,8 @@ AmbientCapabilities=CAP_NET_BIND_SERVICE
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
SyslogIdentifier=caddy
|
||||
KillMode=mixed
|
||||
MemoryMax=2G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@ -22,7 +22,7 @@ database:
|
||||
{{end}}{{if .NodeNoVerify}}node_no_verify: true
|
||||
{{end}}{{end}}cluster_sync_interval: "30s"
|
||||
peer_inactivity_limit: "24h"
|
||||
min_cluster_size: 1
|
||||
min_cluster_size: {{if .MinClusterSize}}{{.MinClusterSize}}{{else}}1{{end}}
|
||||
ipfs:
|
||||
cluster_api_url: "http://localhost:{{.ClusterAPIPort}}"
|
||||
api_url: "http://localhost:{{.IPFSAPIPort}}"
|
||||
|
||||
@ -33,6 +33,7 @@ type NodeConfigData struct {
|
||||
HTTPPort int // HTTP port for ACME challenges (usually 80)
|
||||
HTTPSPort int // HTTPS port (usually 443)
|
||||
WGIP string // WireGuard IP address (e.g., 10.0.0.1)
|
||||
MinClusterSize int // Minimum cluster size for RQLite discovery (1 for genesis, 3 for joining)
|
||||
|
||||
// Node-to-node TLS encryption for RQLite Raft communication
|
||||
// Required when using SNI gateway for Raft traffic routing
|
||||
|
||||
@ -26,6 +26,7 @@ func (g *Gateway) acmePresentHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req ACMERequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
g.logger.Error("Failed to decode ACME present request", zap.Error(err))
|
||||
@ -83,6 +84,7 @@ func (g *Gateway) acmeCleanupHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req ACMERequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
g.logger.Error("Failed to decode ACME cleanup request", zap.Error(err))
|
||||
|
||||
121
pkg/gateway/circuit_breaker.go
Normal file
121
pkg/gateway/circuit_breaker.go
Normal file
@ -0,0 +1,121 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CircuitState represents the current state of a circuit breaker
|
||||
type CircuitState int
|
||||
|
||||
const (
|
||||
CircuitClosed CircuitState = iota // Normal operation
|
||||
CircuitOpen // Fast-failing
|
||||
CircuitHalfOpen // Probing with a single request
|
||||
)
|
||||
|
||||
const (
|
||||
defaultFailureThreshold = 5
|
||||
defaultOpenDuration = 30 * time.Second
|
||||
)
|
||||
|
||||
// CircuitBreaker implements the circuit breaker pattern per target.
|
||||
type CircuitBreaker struct {
|
||||
mu sync.Mutex
|
||||
state CircuitState
|
||||
failures int
|
||||
failureThreshold int
|
||||
lastFailure time.Time
|
||||
openDuration time.Duration
|
||||
}
|
||||
|
||||
// NewCircuitBreaker creates a circuit breaker with default settings.
|
||||
func NewCircuitBreaker() *CircuitBreaker {
|
||||
return &CircuitBreaker{
|
||||
failureThreshold: defaultFailureThreshold,
|
||||
openDuration: defaultOpenDuration,
|
||||
}
|
||||
}
|
||||
|
||||
// Allow checks whether a request should be allowed through.
|
||||
// Returns false if the circuit is open (fast-fail).
|
||||
func (cb *CircuitBreaker) Allow() bool {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
switch cb.state {
|
||||
case CircuitClosed:
|
||||
return true
|
||||
case CircuitOpen:
|
||||
if time.Since(cb.lastFailure) >= cb.openDuration {
|
||||
cb.state = CircuitHalfOpen
|
||||
return true
|
||||
}
|
||||
return false
|
||||
case CircuitHalfOpen:
|
||||
// Only one probe at a time — already in half-open means one is in flight
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// RecordSuccess records a successful response, resetting the circuit.
|
||||
func (cb *CircuitBreaker) RecordSuccess() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
cb.failures = 0
|
||||
cb.state = CircuitClosed
|
||||
}
|
||||
|
||||
// RecordFailure records a failed response, potentially opening the circuit.
|
||||
func (cb *CircuitBreaker) RecordFailure() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
cb.failures++
|
||||
cb.lastFailure = time.Now()
|
||||
if cb.failures >= cb.failureThreshold {
|
||||
cb.state = CircuitOpen
|
||||
}
|
||||
}
|
||||
|
||||
// IsResponseFailure checks if an HTTP response status indicates a backend failure
|
||||
// that should count toward the circuit breaker threshold.
|
||||
func IsResponseFailure(statusCode int) bool {
|
||||
return statusCode == http.StatusBadGateway ||
|
||||
statusCode == http.StatusServiceUnavailable ||
|
||||
statusCode == http.StatusGatewayTimeout
|
||||
}
|
||||
|
||||
// CircuitBreakerRegistry manages per-target circuit breakers.
|
||||
type CircuitBreakerRegistry struct {
|
||||
mu sync.RWMutex
|
||||
breakers map[string]*CircuitBreaker
|
||||
}
|
||||
|
||||
// NewCircuitBreakerRegistry creates a new registry.
|
||||
func NewCircuitBreakerRegistry() *CircuitBreakerRegistry {
|
||||
return &CircuitBreakerRegistry{
|
||||
breakers: make(map[string]*CircuitBreaker),
|
||||
}
|
||||
}
|
||||
|
||||
// Get returns (or creates) a circuit breaker for the given target key.
|
||||
func (r *CircuitBreakerRegistry) Get(target string) *CircuitBreaker {
|
||||
r.mu.RLock()
|
||||
cb, ok := r.breakers[target]
|
||||
r.mu.RUnlock()
|
||||
if ok {
|
||||
return cb
|
||||
}
|
||||
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
// Double-check after acquiring write lock
|
||||
if cb, ok = r.breakers[target]; ok {
|
||||
return cb
|
||||
}
|
||||
cb = NewCircuitBreaker()
|
||||
r.breakers[target] = cb
|
||||
return cb
|
||||
}
|
||||
21
pkg/gateway/connlimit.go
Normal file
21
pkg/gateway/connlimit.go
Normal file
@ -0,0 +1,21 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"net"
|
||||
|
||||
"golang.org/x/net/netutil"
|
||||
)
|
||||
|
||||
const (
|
||||
// DefaultMaxConnections is the maximum number of concurrent connections per server.
|
||||
DefaultMaxConnections = 10000
|
||||
)
|
||||
|
||||
// LimitedListener wraps a net.Listener with a maximum concurrent connection limit.
|
||||
// When the limit is reached, new connections block until an existing one closes.
|
||||
func LimitedListener(l net.Listener, maxConns int) net.Listener {
|
||||
if maxConns <= 0 {
|
||||
maxConns = DefaultMaxConnections
|
||||
}
|
||||
return netutil.LimitListener(l, maxConns)
|
||||
}
|
||||
@ -2,11 +2,7 @@ package gateway
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/x509"
|
||||
"database/sql"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
@ -424,13 +420,11 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
|
||||
logger.Logger,
|
||||
)
|
||||
|
||||
// Initialize auth service
|
||||
// For now using ephemeral key, can be loaded from config later
|
||||
key, _ := rsa.GenerateKey(rand.Reader, 2048)
|
||||
keyPEM := pem.EncodeToMemory(&pem.Block{
|
||||
Type: "RSA PRIVATE KEY",
|
||||
Bytes: x509.MarshalPKCS1PrivateKey(key),
|
||||
})
|
||||
// Initialize auth service with persistent signing key
|
||||
keyPEM, err := loadOrCreateSigningKey(cfg.DataDir, logger)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load or create JWT signing key: %w", err)
|
||||
}
|
||||
authService, err := auth.NewService(logger, networkClient, string(keyPEM), cfg.ClientNamespace)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to initialize auth service: %w", err)
|
||||
|
||||
@ -117,8 +117,9 @@ type Gateway struct {
|
||||
// Request log batcher (aggregates writes instead of per-request inserts)
|
||||
logBatcher *requestLogBatcher
|
||||
|
||||
// Rate limiter
|
||||
rateLimiter *RateLimiter
|
||||
// Rate limiters
|
||||
rateLimiter *RateLimiter
|
||||
namespaceRateLimiter *NamespaceRateLimiter
|
||||
|
||||
// WireGuard peer exchange
|
||||
wireguardHandler *wireguardhandlers.Handler
|
||||
@ -143,6 +144,12 @@ type Gateway struct {
|
||||
|
||||
// Node recovery handler (called when health monitor confirms a node dead or recovered)
|
||||
nodeRecoverer authhandlers.NodeRecoverer
|
||||
|
||||
// Circuit breakers for proxy targets (per-target failure tracking)
|
||||
circuitBreakers *CircuitBreakerRegistry
|
||||
|
||||
// Shared HTTP transport for proxy connections (connection pooling)
|
||||
proxyTransport *http.Transport
|
||||
}
|
||||
|
||||
// localSubscriber represents a WebSocket subscriber for local message delivery
|
||||
@ -261,6 +268,12 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
|
||||
authService: deps.AuthService,
|
||||
localSubscribers: make(map[string][]*localSubscriber),
|
||||
presenceMembers: make(map[string][]PresenceMember),
|
||||
circuitBreakers: NewCircuitBreakerRegistry(),
|
||||
proxyTransport: &http.Transport{
|
||||
MaxIdleConns: 200,
|
||||
MaxIdleConnsPerHost: 20,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
},
|
||||
}
|
||||
|
||||
// Resolve local WireGuard IP for local namespace gateway preference
|
||||
@ -337,9 +350,12 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) {
|
||||
// Initialize request log batcher (flush every 5 seconds)
|
||||
gw.logBatcher = newRequestLogBatcher(gw, 5*time.Second, 100)
|
||||
|
||||
// Initialize rate limiter (10000 req/min, burst 5000)
|
||||
// Initialize rate limiters
|
||||
// Per-IP: 10000 req/min, burst 5000
|
||||
gw.rateLimiter = NewRateLimiter(10000, 5000)
|
||||
gw.rateLimiter.StartCleanup(5*time.Minute, 10*time.Minute)
|
||||
// Per-namespace: 60000 req/hr (1000/min), burst 500
|
||||
gw.namespaceRateLimiter = NewNamespaceRateLimiter(1000, 500)
|
||||
|
||||
// Initialize WireGuard peer exchange handler
|
||||
if deps.ORMClient != nil {
|
||||
|
||||
@ -25,6 +25,7 @@ func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req APIKeyRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
@ -139,6 +140,7 @@ func (h *Handlers) SimpleAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req SimpleAPIKeyRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -24,6 +24,7 @@ func (h *Handlers) ChallengeHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req ChallengeRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -86,6 +86,7 @@ func (h *Handlers) RefreshHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req RefreshRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
@ -130,6 +131,7 @@ func (h *Handlers) LogoutHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req LogoutRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -24,6 +24,7 @@ func (h *Handlers) VerifyHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req VerifyRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -73,6 +73,7 @@ func (h *Handlers) RegisterHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 64*1024) // 64KB
|
||||
var req RegisterRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
1
pkg/gateway/handlers/cache/delete_handler.go
vendored
1
pkg/gateway/handlers/cache/delete_handler.go
vendored
@ -41,6 +41,7 @@ func (h *CacheHandlers) DeleteHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
|
||||
var req DeleteRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
2
pkg/gateway/handlers/cache/get_handler.go
vendored
2
pkg/gateway/handlers/cache/get_handler.go
vendored
@ -43,6 +43,7 @@ func (h *CacheHandlers) GetHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
|
||||
var req GetRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
@ -135,6 +136,7 @@ func (h *CacheHandlers) MultiGetHandler(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
|
||||
var req MultiGetRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
1
pkg/gateway/handlers/cache/list_handler.go
vendored
1
pkg/gateway/handlers/cache/list_handler.go
vendored
@ -40,6 +40,7 @@ func (h *CacheHandlers) ScanHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
|
||||
var req ScanRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
1
pkg/gateway/handlers/cache/set_handler.go
vendored
1
pkg/gateway/handlers/cache/set_handler.go
vendored
@ -51,6 +51,7 @@ func (h *CacheHandlers) SetHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 10<<20) // 10MB
|
||||
var req PutRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -38,6 +38,7 @@ func (h *DomainHandler) HandleAddDomain(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req struct {
|
||||
DeploymentName string `json:"deployment_name"`
|
||||
Domain string `json:"domain"`
|
||||
@ -156,6 +157,7 @@ func (h *DomainHandler) HandleVerifyDomain(w http.ResponseWriter, r *http.Reques
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req struct {
|
||||
Domain string `json:"domain"`
|
||||
}
|
||||
|
||||
@ -75,6 +75,7 @@ func (h *ReplicaHandler) HandleSetup(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req replicaSetupRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
||||
@ -194,6 +195,7 @@ func (h *ReplicaHandler) HandleUpdate(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req replicaUpdateRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
||||
@ -338,6 +340,7 @@ func (h *ReplicaHandler) HandleTeardown(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req replicaTeardownRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
||||
|
||||
@ -37,6 +37,7 @@ func (h *RollbackHandler) HandleRollback(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req struct {
|
||||
Name string `json:"name"`
|
||||
Version int `json:"version"`
|
||||
|
||||
@ -84,6 +84,7 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req JoinRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "invalid request body", http.StatusBadRequest)
|
||||
|
||||
@ -86,6 +86,7 @@ func (h *SpawnHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req SpawnRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeSpawnResponse(w, http.StatusBadRequest, SpawnResponse{Error: "invalid request body"})
|
||||
|
||||
@ -155,6 +155,7 @@ func (h *StatusHandler) HandleProvision(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req ProvisionRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid json body")
|
||||
|
||||
@ -27,6 +27,7 @@ func (p *PubSubHandlers) PublishHandler(w http.ResponseWriter, r *http.Request)
|
||||
writeError(w, http.StatusForbidden, "namespace not resolved")
|
||||
return
|
||||
}
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var body PublishRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Topic == "" || body.DataB64 == "" {
|
||||
writeError(w, http.StatusBadRequest, "invalid body: expected {topic,data_base64}")
|
||||
|
||||
@ -41,6 +41,7 @@ func (h *BackupHandler) BackupDatabase(w http.ResponseWriter, r *http.Request) {
|
||||
DatabaseName string `json:"database_name"`
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
||||
return
|
||||
|
||||
@ -74,6 +74,7 @@ func (h *SQLiteHandler) CreateDatabase(w http.ResponseWriter, r *http.Request) {
|
||||
DatabaseName string `json:"database_name"`
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeCreateError(w, http.StatusBadRequest, "Invalid request body")
|
||||
return
|
||||
|
||||
@ -44,6 +44,7 @@ func (h *SQLiteHandler) QueryDatabase(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req QueryRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeJSONError(w, http.StatusBadRequest, "Invalid request body")
|
||||
|
||||
@ -23,6 +23,7 @@ func (h *Handlers) PinHandler(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req StoragePinRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
httputil.WriteError(w, http.StatusBadRequest, fmt.Sprintf("failed to decode request: %v", err))
|
||||
|
||||
@ -74,6 +74,7 @@ func (h *Handlers) UploadHandler(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
} else {
|
||||
// Handle JSON request with base64 data
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req StorageUploadRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
httputil.WriteError(w, http.StatusBadRequest, fmt.Sprintf("failed to decode request: %v", err))
|
||||
|
||||
@ -58,6 +58,7 @@ func (h *Handler) HandleRegisterPeer(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // 1MB
|
||||
var req RegisterPeerRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "invalid request body", http.StatusBadRequest)
|
||||
|
||||
@ -194,15 +194,21 @@ func (hg *HTTPGateway) Start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
hg.server = &http.Server{
|
||||
Addr: hg.config.ListenAddr,
|
||||
Handler: hg.router,
|
||||
Addr: hg.config.ListenAddr,
|
||||
Handler: hg.router,
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
// Listen for connections
|
||||
listener, err := net.Listen("tcp", hg.config.ListenAddr)
|
||||
// Listen for connections with a max concurrent connection limit
|
||||
rawListener, err := net.Listen("tcp", hg.config.ListenAddr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to listen on %s: %w", hg.config.ListenAddr, err)
|
||||
}
|
||||
listener := LimitedListener(rawListener, DefaultMaxConnections)
|
||||
|
||||
hg.logger.ComponentInfo(logging.ComponentGeneral, "HTTP Gateway server starting",
|
||||
zap.String("node_name", hg.config.NodeName),
|
||||
|
||||
@ -111,8 +111,13 @@ func (g *HTTPSGateway) Start(ctx context.Context) error {
|
||||
|
||||
// Start HTTP server for ACME challenge and redirect
|
||||
g.httpServer = &http.Server{
|
||||
Addr: fmt.Sprintf(":%d", httpPort),
|
||||
Handler: g.httpHandler(),
|
||||
Addr: fmt.Sprintf(":%d", httpPort),
|
||||
Handler: g.httpHandler(),
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
go func() {
|
||||
@ -143,15 +148,21 @@ func (g *HTTPSGateway) Start(ctx context.Context) error {
|
||||
|
||||
// Start HTTPS server
|
||||
g.httpsServer = &http.Server{
|
||||
Addr: fmt.Sprintf(":%d", httpsPort),
|
||||
Handler: g.router,
|
||||
TLSConfig: tlsConfig,
|
||||
Addr: fmt.Sprintf(":%d", httpsPort),
|
||||
Handler: g.router,
|
||||
TLSConfig: tlsConfig,
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
|
||||
listener, err := tls.Listen("tcp", g.httpsServer.Addr, tlsConfig)
|
||||
rawListener, err := tls.Listen("tcp", g.httpsServer.Addr, tlsConfig)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create TLS listener: %w", err)
|
||||
}
|
||||
listener := LimitedListener(rawListener, DefaultMaxConnections)
|
||||
|
||||
g.logger.ComponentInfo(logging.ComponentGeneral, "HTTPS Gateway starting",
|
||||
zap.String("domain", g.httpsConfig.Domain),
|
||||
|
||||
@ -179,14 +179,15 @@ func (g *Gateway) proxyWebSocket(w http.ResponseWriter, r *http.Request, targetH
|
||||
|
||||
// withMiddleware adds CORS, security headers, rate limiting, and logging middleware
|
||||
func (g *Gateway) withMiddleware(next http.Handler) http.Handler {
|
||||
// Order: logging -> security headers -> rate limit -> CORS -> domain routing -> auth -> handler
|
||||
// Order: logging -> security headers -> rate limit -> CORS -> domain routing -> auth -> namespace rate limit -> handler
|
||||
return g.loggingMiddleware(
|
||||
g.securityHeadersMiddleware(
|
||||
g.rateLimitMiddleware(
|
||||
g.corsMiddleware(
|
||||
g.domainRoutingMiddleware(
|
||||
g.authMiddleware(
|
||||
g.authorizationMiddleware(next)))))))
|
||||
g.authorizationMiddleware(
|
||||
g.namespaceRateLimitMiddleware(next))))))))
|
||||
}
|
||||
|
||||
// securityHeadersMiddleware adds standard security headers to all responses
|
||||
@ -406,13 +407,16 @@ func extractAPIKey(r *http.Request) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to query parameter (for WebSocket support)
|
||||
if v := strings.TrimSpace(r.URL.Query().Get("api_key")); v != "" {
|
||||
return v
|
||||
}
|
||||
// Also check token query parameter (alternative name)
|
||||
if v := strings.TrimSpace(r.URL.Query().Get("token")); v != "" {
|
||||
return v
|
||||
// Fallback to query parameter ONLY for WebSocket upgrade requests.
|
||||
// WebSocket clients cannot set custom headers, so query params are the
|
||||
// only way to authenticate. For regular HTTP requests, require headers.
|
||||
if isWebSocketUpgrade(r) {
|
||||
if v := strings.TrimSpace(r.URL.Query().Get("api_key")); v != "" {
|
||||
return v
|
||||
}
|
||||
if v := strings.TrimSpace(r.URL.Query().Get("token")); v != "" {
|
||||
return v
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@ -658,13 +662,20 @@ func requiresNamespaceOwnership(p string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// corsMiddleware applies permissive CORS headers suitable for early development
|
||||
// corsMiddleware applies CORS headers. Allows requests from the configured base
|
||||
// domain and its subdomains. Falls back to permissive "*" only if no base domain
|
||||
// is configured.
|
||||
func (g *Gateway) corsMiddleware(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
origin := r.Header.Get("Origin")
|
||||
allowedOrigin := g.getAllowedOrigin(origin)
|
||||
w.Header().Set("Access-Control-Allow-Origin", allowedOrigin)
|
||||
w.Header().Set("Access-Control-Allow-Methods", "GET, PUT, POST, DELETE, OPTIONS")
|
||||
w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization, X-API-Key")
|
||||
w.Header().Set("Access-Control-Max-Age", strconv.Itoa(600))
|
||||
if allowedOrigin != "*" {
|
||||
w.Header().Set("Vary", "Origin")
|
||||
}
|
||||
if r.Method == http.MethodOptions {
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return
|
||||
@ -673,6 +684,36 @@ func (g *Gateway) corsMiddleware(next http.Handler) http.Handler {
|
||||
})
|
||||
}
|
||||
|
||||
// getAllowedOrigin returns the allowed origin for CORS based on the request origin.
|
||||
// If no base domain is configured, allows all origins (*).
|
||||
// Otherwise, allows the base domain and any subdomain of it.
|
||||
func (g *Gateway) getAllowedOrigin(origin string) string {
|
||||
if g.cfg.BaseDomain == "" {
|
||||
return "*"
|
||||
}
|
||||
if origin == "" {
|
||||
return "https://" + g.cfg.BaseDomain
|
||||
}
|
||||
// Extract hostname from origin (e.g., "https://app.dbrs.space" -> "app.dbrs.space")
|
||||
host := origin
|
||||
if idx := strings.Index(host, "://"); idx != -1 {
|
||||
host = host[idx+3:]
|
||||
}
|
||||
// Strip port if present
|
||||
if idx := strings.Index(host, ":"); idx != -1 {
|
||||
host = host[:idx]
|
||||
}
|
||||
// Allow exact match or subdomain match
|
||||
if host == g.cfg.BaseDomain || strings.HasSuffix(host, "."+g.cfg.BaseDomain) {
|
||||
return origin
|
||||
}
|
||||
// Also allow common development origins
|
||||
if host == "localhost" || host == "127.0.0.1" {
|
||||
return origin
|
||||
}
|
||||
return "https://" + g.cfg.BaseDomain
|
||||
}
|
||||
|
||||
// persistRequestLog writes request metadata to the database (best-effort)
|
||||
func (g *Gateway) persistRequestLog(r *http.Request, srw *statusResponseWriter, dur time.Duration) {
|
||||
if g.client == nil {
|
||||
@ -1024,10 +1065,19 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
|
||||
proxyReq.Header.Set(HeaderInternalAuthNamespace, validatedNamespace)
|
||||
}
|
||||
|
||||
// Execute proxy request
|
||||
httpClient := &http.Client{Timeout: 30 * time.Second}
|
||||
// Circuit breaker: check if target is healthy before sending request
|
||||
cbKey := "ns:" + gatewayIP
|
||||
cb := g.circuitBreakers.Get(cbKey)
|
||||
if !cb.Allow() {
|
||||
http.Error(w, "Namespace gateway unavailable (circuit open)", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
// Execute proxy request using shared transport for connection pooling
|
||||
httpClient := &http.Client{Timeout: 30 * time.Second, Transport: g.proxyTransport}
|
||||
resp, err := httpClient.Do(proxyReq)
|
||||
if err != nil {
|
||||
cb.RecordFailure()
|
||||
g.logger.ComponentError(logging.ComponentGeneral, "namespace gateway proxy request failed",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("target", gatewayIP),
|
||||
@ -1038,6 +1088,12 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if IsResponseFailure(resp.StatusCode) {
|
||||
cb.RecordFailure()
|
||||
} else {
|
||||
cb.RecordSuccess()
|
||||
}
|
||||
|
||||
// Copy response headers
|
||||
for key, values := range resp.Header {
|
||||
for _, value := range values {
|
||||
@ -1255,8 +1311,8 @@ serveLocal:
|
||||
}
|
||||
}
|
||||
|
||||
// Execute proxy request
|
||||
httpClient := &http.Client{Timeout: 30 * time.Second}
|
||||
// Execute proxy request using shared transport
|
||||
httpClient := &http.Client{Timeout: 30 * time.Second, Transport: g.proxyTransport}
|
||||
resp, err := httpClient.Do(proxyReq)
|
||||
if err != nil {
|
||||
g.logger.ComponentError(logging.ComponentGeneral, "local proxy request failed",
|
||||
@ -1354,13 +1410,19 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym
|
||||
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
|
||||
proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) // Prevent loops
|
||||
|
||||
// Simple HTTP client for internal node-to-node communication
|
||||
httpClient := &http.Client{
|
||||
Timeout: 120 * time.Second,
|
||||
// Circuit breaker: check if target node is healthy
|
||||
cbKey := "node:" + homeIP
|
||||
cb := g.circuitBreakers.Get(cbKey)
|
||||
if !cb.Allow() {
|
||||
g.logger.Warn("Cross-node proxy skipped (circuit open)", zap.String("target_ip", homeIP))
|
||||
return false
|
||||
}
|
||||
|
||||
// Internal node-to-node communication using shared transport
|
||||
httpClient := &http.Client{Timeout: 120 * time.Second, Transport: g.proxyTransport}
|
||||
resp, err := httpClient.Do(proxyReq)
|
||||
if err != nil {
|
||||
cb.RecordFailure()
|
||||
g.logger.Error("Cross-node proxy request failed",
|
||||
zap.String("target_ip", homeIP),
|
||||
zap.String("host", r.Host),
|
||||
@ -1369,6 +1431,12 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if IsResponseFailure(resp.StatusCode) {
|
||||
cb.RecordFailure()
|
||||
} else {
|
||||
cb.RecordSuccess()
|
||||
}
|
||||
|
||||
// Copy response headers
|
||||
for key, values := range resp.Header {
|
||||
for _, value := range values {
|
||||
@ -1465,9 +1533,18 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep
|
||||
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
|
||||
proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID)
|
||||
|
||||
httpClient := &http.Client{Timeout: 5 * time.Second}
|
||||
// Circuit breaker: skip this replica if it's been failing
|
||||
cbKey := "node:" + nodeIP
|
||||
cb := g.circuitBreakers.Get(cbKey)
|
||||
if !cb.Allow() {
|
||||
g.logger.Warn("Replica proxy skipped (circuit open)", zap.String("target_ip", nodeIP))
|
||||
return false
|
||||
}
|
||||
|
||||
httpClient := &http.Client{Timeout: 5 * time.Second, Transport: g.proxyTransport}
|
||||
resp, err := httpClient.Do(proxyReq)
|
||||
if err != nil {
|
||||
cb.RecordFailure()
|
||||
g.logger.Warn("Replica proxy request failed",
|
||||
zap.String("target_ip", nodeIP),
|
||||
zap.Error(err),
|
||||
@ -1477,13 +1554,15 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If the remote node returned a gateway error, try the next replica
|
||||
if resp.StatusCode == http.StatusBadGateway || resp.StatusCode == http.StatusServiceUnavailable || resp.StatusCode == http.StatusGatewayTimeout {
|
||||
if IsResponseFailure(resp.StatusCode) {
|
||||
cb.RecordFailure()
|
||||
g.logger.Warn("Replica returned gateway error, trying next",
|
||||
zap.String("target_ip", nodeIP),
|
||||
zap.Int("status", resp.StatusCode),
|
||||
)
|
||||
return false
|
||||
}
|
||||
cb.RecordSuccess()
|
||||
|
||||
for key, values := range resp.Header {
|
||||
for _, value := range values {
|
||||
|
||||
@ -82,6 +82,28 @@ func (rl *RateLimiter) StartCleanup(interval, maxAge time.Duration) {
|
||||
}()
|
||||
}
|
||||
|
||||
// NamespaceRateLimiter provides per-namespace rate limiting using a sync.Map
|
||||
// for better concurrent performance than a single mutex.
|
||||
type NamespaceRateLimiter struct {
|
||||
limiters sync.Map // namespace -> *RateLimiter
|
||||
rate int // per-minute rate per namespace
|
||||
burst int
|
||||
}
|
||||
|
||||
// NewNamespaceRateLimiter creates a per-namespace rate limiter.
|
||||
func NewNamespaceRateLimiter(ratePerMinute, burst int) *NamespaceRateLimiter {
|
||||
return &NamespaceRateLimiter{rate: ratePerMinute, burst: burst}
|
||||
}
|
||||
|
||||
// Allow checks if a request for the given namespace should be allowed.
|
||||
func (nrl *NamespaceRateLimiter) Allow(namespace string) bool {
|
||||
if namespace == "" {
|
||||
return true
|
||||
}
|
||||
val, _ := nrl.limiters.LoadOrStore(namespace, NewRateLimiter(nrl.rate, nrl.burst))
|
||||
return val.(*RateLimiter).Allow(namespace)
|
||||
}
|
||||
|
||||
// rateLimitMiddleware returns 429 when a client exceeds the rate limit.
|
||||
// Internal traffic from the WireGuard subnet (10.0.0.0/8) is exempt.
|
||||
func (g *Gateway) rateLimitMiddleware(next http.Handler) http.Handler {
|
||||
@ -106,6 +128,27 @@ func (g *Gateway) rateLimitMiddleware(next http.Handler) http.Handler {
|
||||
})
|
||||
}
|
||||
|
||||
// namespaceRateLimitMiddleware enforces per-namespace rate limits.
|
||||
// It runs after auth middleware so the namespace is available in context.
|
||||
func (g *Gateway) namespaceRateLimitMiddleware(next http.Handler) http.Handler {
|
||||
if g.namespaceRateLimiter == nil {
|
||||
return next
|
||||
}
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Extract namespace from context (set by auth middleware)
|
||||
if v := r.Context().Value(CtxKeyNamespaceOverride); v != nil {
|
||||
if ns, ok := v.(string); ok && ns != "" {
|
||||
if !g.namespaceRateLimiter.Allow(ns) {
|
||||
w.Header().Set("Retry-After", "60")
|
||||
http.Error(w, "namespace rate limit exceeded", http.StatusTooManyRequests)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
next.ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
// isInternalIP returns true if the IP is in the WireGuard 10.0.0.0/8 subnet
|
||||
// or is a loopback address.
|
||||
func isInternalIP(ipStr string) bool {
|
||||
|
||||
63
pkg/gateway/signing_key.go
Normal file
63
pkg/gateway/signing_key.go
Normal file
@ -0,0 +1,63 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/x509"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/logging"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
const jwtKeyFileName = "jwt-signing-key.pem"
|
||||
|
||||
// loadOrCreateSigningKey loads the JWT signing key from disk, or generates a new one
|
||||
// if none exists. This ensures JWTs survive gateway restarts.
|
||||
func loadOrCreateSigningKey(dataDir string, logger *logging.ColoredLogger) ([]byte, error) {
|
||||
keyPath := filepath.Join(dataDir, "secrets", jwtKeyFileName)
|
||||
|
||||
// Try to load existing key
|
||||
if keyPEM, err := os.ReadFile(keyPath); err == nil && len(keyPEM) > 0 {
|
||||
// Verify the key is valid
|
||||
block, _ := pem.Decode(keyPEM)
|
||||
if block != nil {
|
||||
if _, err := x509.ParsePKCS1PrivateKey(block.Bytes); err == nil {
|
||||
logger.ComponentInfo(logging.ComponentGeneral, "Loaded existing JWT signing key",
|
||||
zap.String("path", keyPath))
|
||||
return keyPEM, nil
|
||||
}
|
||||
}
|
||||
logger.ComponentWarn(logging.ComponentGeneral, "Existing JWT signing key is invalid, generating new one",
|
||||
zap.String("path", keyPath))
|
||||
}
|
||||
|
||||
// Generate new key
|
||||
key, err := rsa.GenerateKey(rand.Reader, 2048)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("generate RSA key: %w", err)
|
||||
}
|
||||
|
||||
keyPEM := pem.EncodeToMemory(&pem.Block{
|
||||
Type: "RSA PRIVATE KEY",
|
||||
Bytes: x509.MarshalPKCS1PrivateKey(key),
|
||||
})
|
||||
|
||||
// Ensure secrets directory exists
|
||||
secretsDir := filepath.Dir(keyPath)
|
||||
if err := os.MkdirAll(secretsDir, 0700); err != nil {
|
||||
return nil, fmt.Errorf("create secrets directory: %w", err)
|
||||
}
|
||||
|
||||
// Write key with restrictive permissions
|
||||
if err := os.WriteFile(keyPath, keyPEM, 0600); err != nil {
|
||||
return nil, fmt.Errorf("write signing key: %w", err)
|
||||
}
|
||||
|
||||
logger.ComponentInfo(logging.ComponentGeneral, "Generated and saved new JWT signing key",
|
||||
zap.String("path", keyPath))
|
||||
return keyPEM, nil
|
||||
}
|
||||
@ -130,8 +130,13 @@ func (n *Node) startHTTPGateway(ctx context.Context) error {
|
||||
|
||||
go func() {
|
||||
server := &http.Server{
|
||||
Addr: gwCfg.ListenAddr,
|
||||
Handler: apiGateway.Routes(),
|
||||
Addr: gwCfg.ListenAddr,
|
||||
Handler: apiGateway.Routes(),
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
ReadTimeout: 60 * time.Second,
|
||||
WriteTimeout: 120 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
MaxHeaderBytes: 1 << 20, // 1MB
|
||||
}
|
||||
n.apiGatewayServer = server
|
||||
|
||||
|
||||
199
pkg/rqlite/backup.go
Normal file
199
pkg/rqlite/backup.go
Normal file
@ -0,0 +1,199 @@
|
||||
package rqlite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultBackupInterval = 1 * time.Hour
|
||||
maxBackupRetention = 24
|
||||
backupDirName = "backups/rqlite"
|
||||
backupPrefix = "rqlite-backup-"
|
||||
backupSuffix = ".db"
|
||||
backupTimestampFormat = "20060102-150405"
|
||||
)
|
||||
|
||||
// startBackupLoop runs a periodic backup of the RQLite database.
|
||||
// It saves consistent SQLite snapshots to the local backup directory.
|
||||
// Only the leader node performs backups; followers skip silently.
|
||||
func (r *RQLiteManager) startBackupLoop(ctx context.Context) {
|
||||
interval := r.config.BackupInterval
|
||||
if interval <= 0 {
|
||||
interval = defaultBackupInterval
|
||||
}
|
||||
|
||||
r.logger.Info("RQLite backup loop started",
|
||||
zap.Duration("interval", interval),
|
||||
zap.Int("max_retention", maxBackupRetention))
|
||||
|
||||
// Wait before the first backup to let the cluster stabilize
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(interval):
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
// Run the first backup immediately after the initial wait
|
||||
r.performBackup()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
r.logger.Info("RQLite backup loop stopped")
|
||||
return
|
||||
case <-ticker.C:
|
||||
r.performBackup()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// performBackup executes a single backup cycle: check leadership, take snapshot, prune old backups.
|
||||
func (r *RQLiteManager) performBackup() {
|
||||
// Only the leader should perform backups to avoid duplicate work
|
||||
if !r.isLeaderNode() {
|
||||
r.logger.Debug("Skipping backup: this node is not the leader")
|
||||
return
|
||||
}
|
||||
|
||||
backupDir := r.backupDir()
|
||||
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||
r.logger.Error("Failed to create backup directory",
|
||||
zap.String("dir", backupDir),
|
||||
zap.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
timestamp := time.Now().UTC().Format(backupTimestampFormat)
|
||||
filename := fmt.Sprintf("%s%s%s", backupPrefix, timestamp, backupSuffix)
|
||||
backupPath := filepath.Join(backupDir, filename)
|
||||
|
||||
if err := r.downloadBackup(backupPath); err != nil {
|
||||
r.logger.Error("Failed to download RQLite backup",
|
||||
zap.String("path", backupPath),
|
||||
zap.Error(err))
|
||||
// Clean up partial file
|
||||
_ = os.Remove(backupPath)
|
||||
return
|
||||
}
|
||||
|
||||
info, err := os.Stat(backupPath)
|
||||
if err != nil {
|
||||
r.logger.Error("Failed to stat backup file",
|
||||
zap.String("path", backupPath),
|
||||
zap.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
r.logger.Info("RQLite backup completed",
|
||||
zap.String("path", backupPath),
|
||||
zap.Int64("size_bytes", info.Size()))
|
||||
|
||||
r.pruneOldBackups(backupDir)
|
||||
}
|
||||
|
||||
// isLeaderNode checks whether this node is currently the Raft leader.
|
||||
func (r *RQLiteManager) isLeaderNode() bool {
|
||||
status, err := r.getRQLiteStatus()
|
||||
if err != nil {
|
||||
r.logger.Debug("Cannot determine leader status, skipping backup", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
return status.Store.Raft.State == "Leader"
|
||||
}
|
||||
|
||||
// backupDir returns the path to the backup directory.
|
||||
func (r *RQLiteManager) backupDir() string {
|
||||
return filepath.Join(r.dataDir, backupDirName)
|
||||
}
|
||||
|
||||
// downloadBackup calls the RQLite backup API and writes the SQLite snapshot to disk.
|
||||
func (r *RQLiteManager) downloadBackup(destPath string) error {
|
||||
url := fmt.Sprintf("http://localhost:%d/db/backup", r.config.RQLitePort)
|
||||
client := &http.Client{Timeout: 2 * time.Minute}
|
||||
|
||||
resp, err := client.Get(url)
|
||||
if err != nil {
|
||||
return fmt.Errorf("request backup endpoint: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("backup endpoint returned %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
outFile, err := os.Create(destPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create backup file: %w", err)
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
written, err := io.Copy(outFile, resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write backup data: %w", err)
|
||||
}
|
||||
|
||||
if written == 0 {
|
||||
return fmt.Errorf("backup file is empty")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// pruneOldBackups removes the oldest backup files, keeping only the most recent maxBackupRetention.
|
||||
func (r *RQLiteManager) pruneOldBackups(backupDir string) {
|
||||
entries, err := os.ReadDir(backupDir)
|
||||
if err != nil {
|
||||
r.logger.Error("Failed to list backup directory for pruning",
|
||||
zap.String("dir", backupDir),
|
||||
zap.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
// Collect only backup files matching our naming convention
|
||||
var backupFiles []os.DirEntry
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() && strings.HasPrefix(entry.Name(), backupPrefix) && strings.HasSuffix(entry.Name(), backupSuffix) {
|
||||
backupFiles = append(backupFiles, entry)
|
||||
}
|
||||
}
|
||||
|
||||
if len(backupFiles) <= maxBackupRetention {
|
||||
return
|
||||
}
|
||||
|
||||
// Sort by name ascending (timestamp in name ensures chronological order)
|
||||
sort.Slice(backupFiles, func(i, j int) bool {
|
||||
return backupFiles[i].Name() < backupFiles[j].Name()
|
||||
})
|
||||
|
||||
// Remove the oldest files beyond the retention limit
|
||||
toDelete := backupFiles[:len(backupFiles)-maxBackupRetention]
|
||||
for _, entry := range toDelete {
|
||||
path := filepath.Join(backupDir, entry.Name())
|
||||
if err := os.Remove(path); err != nil {
|
||||
r.logger.Warn("Failed to delete old backup",
|
||||
zap.String("path", path),
|
||||
zap.Error(err))
|
||||
} else {
|
||||
r.logger.Debug("Pruned old backup", zap.String("path", path))
|
||||
}
|
||||
}
|
||||
|
||||
r.logger.Info("Pruned old backups",
|
||||
zap.Int("deleted", len(toDelete)),
|
||||
zap.Int("remaining", maxBackupRetention))
|
||||
}
|
||||
@ -119,17 +119,15 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
|
||||
// Even if we only discovered ourselves, write peers.json as a fallback
|
||||
// This ensures RQLite has consistent state and can potentially recover
|
||||
// when other nodes come online
|
||||
// If we only discovered ourselves, do NOT write a single-node peers.json.
|
||||
// Writing single-node peers.json causes RQLite to bootstrap as a solo cluster,
|
||||
// making it impossible to rejoin the actual cluster later (-join fails with
|
||||
// "single-node cluster, joining not supported"). Let RQLite start with its
|
||||
// existing Raft state or use the -join flag to connect.
|
||||
if discoveredPeers <= 1 {
|
||||
r.logger.Warn("Only discovered self during pre-start discovery, writing single-node peers.json as fallback",
|
||||
r.logger.Warn("Only discovered self during pre-start discovery, skipping peers.json write to prevent solo bootstrap",
|
||||
zap.Int("discovered_peers", discoveredPeers),
|
||||
zap.Int("min_cluster_size", r.config.MinClusterSize))
|
||||
// Still write peers.json with just ourselves - better than nothing
|
||||
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
|
||||
r.logger.Warn("Failed to write single-node peers.json fallback", zap.Error(err))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -145,6 +145,11 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
|
||||
return fmt.Errorf("failed to start RQLite: %w", err)
|
||||
}
|
||||
|
||||
// Write PID file for reliable orphan detection
|
||||
pidPath := filepath.Join(logsDir, "rqlited.pid")
|
||||
_ = os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", r.cmd.Process.Pid)), 0644)
|
||||
r.logger.Info("RQLite process started", zap.Int("pid", r.cmd.Process.Pid), zap.String("pid_file", pidPath))
|
||||
|
||||
logFile.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package rqlite
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"syscall"
|
||||
"time"
|
||||
@ -71,6 +72,12 @@ func (r *RQLiteManager) Start(ctx context.Context) error {
|
||||
go r.startVoterReconciliation(ctx)
|
||||
}
|
||||
|
||||
// Start child process watchdog to detect and recover from crashes
|
||||
go r.startProcessWatchdog(ctx)
|
||||
|
||||
// Start periodic RQLite backup loop (leader-only, self-checking)
|
||||
go r.startBackupLoop(ctx)
|
||||
|
||||
if err := r.establishLeadershipOrJoin(ctx, rqliteDataDir); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -92,7 +99,9 @@ func (r *RQLiteManager) GetConnection() *gorqlite.Connection {
|
||||
return r.connection
|
||||
}
|
||||
|
||||
// Stop stops the RQLite node
|
||||
// Stop stops the RQLite node gracefully.
|
||||
// If this node is the Raft leader, it attempts a leadership transfer first
|
||||
// to minimize cluster disruption.
|
||||
func (r *RQLiteManager) Stop() error {
|
||||
if r.connection != nil {
|
||||
r.connection.Close()
|
||||
@ -103,16 +112,52 @@ func (r *RQLiteManager) Stop() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Attempt leadership transfer if we are the leader
|
||||
r.transferLeadershipIfLeader()
|
||||
|
||||
_ = r.cmd.Process.Signal(syscall.SIGTERM)
|
||||
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- r.cmd.Wait() }()
|
||||
|
||||
// Give RQLite 30s to flush pending writes and shut down gracefully
|
||||
// (previously 5s which risked Raft log corruption)
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(5 * time.Second):
|
||||
case <-time.After(30 * time.Second):
|
||||
r.logger.Warn("RQLite did not stop within 30s, sending SIGKILL")
|
||||
_ = r.cmd.Process.Kill()
|
||||
}
|
||||
|
||||
// Clean up PID file
|
||||
r.cleanupPIDFile()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// transferLeadershipIfLeader checks if this node is the Raft leader and
|
||||
// requests a leadership transfer to minimize election disruption.
|
||||
func (r *RQLiteManager) transferLeadershipIfLeader() {
|
||||
status, err := r.getRQLiteStatus()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if status.Store.Raft.State != "Leader" {
|
||||
return
|
||||
}
|
||||
|
||||
r.logger.Info("This node is the Raft leader, requesting leadership transfer before shutdown")
|
||||
|
||||
// RQLite doesn't have a direct leadership transfer API, but we can
|
||||
// signal readiness to step down. The fastest approach is to let the
|
||||
// SIGTERM handler in rqlited handle this — rqlite v8 gracefully
|
||||
// steps down on SIGTERM when possible. We log the state for visibility.
|
||||
r.logger.Info("Leader will transfer on SIGTERM (rqlite built-in behavior)")
|
||||
}
|
||||
|
||||
// cleanupPIDFile removes the PID file on shutdown
|
||||
func (r *RQLiteManager) cleanupPIDFile() {
|
||||
logsDir := fmt.Sprintf("%s/../logs", r.dataDir)
|
||||
pidPath := logsDir + "/rqlited.pid"
|
||||
_ = os.Remove(pidPath)
|
||||
}
|
||||
|
||||
@ -36,16 +36,16 @@ func (r *RQLiteManager) prepareDataDir() (string, error) {
|
||||
}
|
||||
|
||||
func (r *RQLiteManager) hasExistingState(rqliteDataDir string) bool {
|
||||
entries, err := os.ReadDir(rqliteDataDir)
|
||||
// Check specifically for raft.db with non-trivial content.
|
||||
// Previously this checked for ANY file in the data dir, which was too broad —
|
||||
// auto-discovery creates peers.json and log files before RQLite starts,
|
||||
// causing false positives that skip the -join flag on restart.
|
||||
raftDB := filepath.Join(rqliteDataDir, "raft.db")
|
||||
info, err := os.Stat(raftDB)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, e := range entries {
|
||||
if e.Name() != "." && e.Name() != ".." {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return info.Size() > 1024
|
||||
}
|
||||
|
||||
func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration {
|
||||
|
||||
@ -76,14 +76,24 @@ func TestHasExistingState(t *testing.T) {
|
||||
t.Errorf("hasExistingState() = true; want false for empty dir")
|
||||
}
|
||||
|
||||
// Test directory with a file
|
||||
// Test directory with only non-raft files (should still be false)
|
||||
testFile := filepath.Join(tmpDir, "test.txt")
|
||||
if err := os.WriteFile(testFile, []byte("data"), 0644); err != nil {
|
||||
t.Fatalf("failed to create test file: %v", err)
|
||||
}
|
||||
|
||||
if r.hasExistingState(tmpDir) {
|
||||
t.Errorf("hasExistingState() = true; want false for dir with only non-raft files")
|
||||
}
|
||||
|
||||
// Test directory with raft.db (should be true)
|
||||
raftDB := filepath.Join(tmpDir, "raft.db")
|
||||
if err := os.WriteFile(raftDB, make([]byte, 2048), 0644); err != nil {
|
||||
t.Fatalf("failed to create raft.db: %v", err)
|
||||
}
|
||||
|
||||
if !r.hasExistingState(tmpDir) {
|
||||
t.Errorf("hasExistingState() = false; want true for non-empty dir")
|
||||
t.Errorf("hasExistingState() = false; want true for dir with raft.db")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -132,21 +132,67 @@ func (r *RQLiteManager) reconcileVoters() error {
|
||||
// cluster and immediately re-adding it with the desired voter flag.
|
||||
// This is necessary because RQLite's /join endpoint ignores voter flag changes
|
||||
// for nodes that are already cluster members with the same ID and address.
|
||||
//
|
||||
// Safety improvements:
|
||||
// - Pre-check: verify quorum would survive the temporary removal
|
||||
// - Rollback: if rejoin fails, attempt to re-add with original status
|
||||
// - Retry: attempt rejoin up to 3 times with backoff
|
||||
func (r *RQLiteManager) changeNodeVoterStatus(nodeID string, voter bool) error {
|
||||
// Pre-check: if demoting a voter, verify quorum safety
|
||||
if !voter {
|
||||
nodes, err := r.getAllClusterNodes()
|
||||
if err != nil {
|
||||
return fmt.Errorf("quorum pre-check: %w", err)
|
||||
}
|
||||
voterCount := 0
|
||||
for _, n := range nodes {
|
||||
if n.Voter && n.Reachable {
|
||||
voterCount++
|
||||
}
|
||||
}
|
||||
// After removing this voter, we need (voterCount-1)/2 + 1 for quorum
|
||||
// which means voterCount-1 > (voterCount-1)/2, i.e., voterCount >= 3
|
||||
if voterCount <= 2 {
|
||||
return fmt.Errorf("cannot remove voter: only %d reachable voters, quorum would be lost", voterCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: Remove the node from the cluster
|
||||
if err := r.removeClusterNode(nodeID); err != nil {
|
||||
return fmt.Errorf("remove node: %w", err)
|
||||
}
|
||||
|
||||
// Brief pause for Raft to commit the configuration change
|
||||
time.Sleep(2 * time.Second)
|
||||
// Wait for Raft to commit the configuration change, then rejoin with retries
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < 3; attempt++ {
|
||||
waitTime := time.Duration(2+attempt*2) * time.Second // 2s, 4s, 6s
|
||||
time.Sleep(waitTime)
|
||||
|
||||
// Step 2: Re-add with the correct voter status
|
||||
if err := r.joinClusterNode(nodeID, nodeID, voter); err != nil {
|
||||
return fmt.Errorf("rejoin node: %w", err)
|
||||
if err := r.joinClusterNode(nodeID, nodeID, voter); err != nil {
|
||||
lastErr = err
|
||||
r.logger.Warn("Rejoin attempt failed, retrying",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.Int("attempt", attempt+1),
|
||||
zap.Error(err))
|
||||
continue
|
||||
}
|
||||
return nil // Success
|
||||
}
|
||||
|
||||
return nil
|
||||
// All rejoin attempts failed — try to re-add with the ORIGINAL status as rollback
|
||||
r.logger.Error("All rejoin attempts failed, attempting rollback",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.Bool("desired_voter", voter),
|
||||
zap.Error(lastErr))
|
||||
|
||||
originalVoter := !voter
|
||||
if err := r.joinClusterNode(nodeID, nodeID, originalVoter); err != nil {
|
||||
r.logger.Error("Rollback also failed — node may be orphaned from cluster",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.Error(err))
|
||||
}
|
||||
|
||||
return fmt.Errorf("rejoin node after 3 attempts: %w", lastErr)
|
||||
}
|
||||
|
||||
// getAllClusterNodes queries /nodes?nonvoters&ver=2 to get all cluster members
|
||||
|
||||
99
pkg/rqlite/watchdog.go
Normal file
99
pkg/rqlite/watchdog.go
Normal file
@ -0,0 +1,99 @@
|
||||
package rqlite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
const (
|
||||
watchdogInterval = 30 * time.Second
|
||||
watchdogMaxRestart = 3
|
||||
)
|
||||
|
||||
// startProcessWatchdog monitors the RQLite child process and restarts it if it crashes.
|
||||
// It checks both process liveness and HTTP responsiveness.
|
||||
func (r *RQLiteManager) startProcessWatchdog(ctx context.Context) {
|
||||
ticker := time.NewTicker(watchdogInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
restartCount := 0
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
if !r.isProcessAlive() {
|
||||
r.logger.Error("RQLite process has died",
|
||||
zap.Int("restart_count", restartCount),
|
||||
zap.Int("max_restarts", watchdogMaxRestart))
|
||||
|
||||
if restartCount >= watchdogMaxRestart {
|
||||
r.logger.Error("RQLite process watchdog: max restart attempts reached, giving up")
|
||||
return
|
||||
}
|
||||
|
||||
if err := r.restartProcess(ctx); err != nil {
|
||||
r.logger.Error("Failed to restart RQLite process", zap.Error(err))
|
||||
restartCount++
|
||||
continue
|
||||
}
|
||||
|
||||
restartCount++
|
||||
r.logger.Info("RQLite process restarted by watchdog",
|
||||
zap.Int("restart_count", restartCount))
|
||||
} else {
|
||||
// Process is alive — check HTTP responsiveness
|
||||
if !r.isHTTPResponsive() {
|
||||
r.logger.Warn("RQLite process is alive but not responding to HTTP")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// isProcessAlive checks if the RQLite child process is still running
|
||||
func (r *RQLiteManager) isProcessAlive() bool {
|
||||
if r.cmd == nil || r.cmd.Process == nil {
|
||||
return false
|
||||
}
|
||||
// On Unix, sending signal 0 checks process existence without actually signaling
|
||||
if err := r.cmd.Process.Signal(nil); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// isHTTPResponsive checks if RQLite is responding to HTTP status requests
|
||||
func (r *RQLiteManager) isHTTPResponsive() bool {
|
||||
url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort)
|
||||
client := &http.Client{Timeout: 5 * time.Second}
|
||||
resp, err := client.Get(url)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
return resp.StatusCode == http.StatusOK
|
||||
}
|
||||
|
||||
// restartProcess attempts to restart the RQLite process
|
||||
func (r *RQLiteManager) restartProcess(ctx context.Context) error {
|
||||
rqliteDataDir, err := r.rqliteDataDirPath()
|
||||
if err != nil {
|
||||
return fmt.Errorf("get data dir: %w", err)
|
||||
}
|
||||
|
||||
if err := r.launchProcess(ctx, rqliteDataDir); err != nil {
|
||||
return fmt.Errorf("launch process: %w", err)
|
||||
}
|
||||
|
||||
if err := r.waitForReadyAndConnect(ctx); err != nil {
|
||||
return fmt.Errorf("wait for ready: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@ -27,7 +27,16 @@ StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=debros-gateway-%i
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=yes
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
ReadWritePaths=/home/debros/.orama/data/namespaces
|
||||
|
||||
LimitNOFILE=65536
|
||||
MemoryMax=1G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@ -27,7 +27,16 @@ StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=debros-olric-%i
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=yes
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
ReadWritePaths=/home/debros/.orama/data/namespaces
|
||||
|
||||
LimitNOFILE=65536
|
||||
MemoryMax=2G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@ -37,8 +37,17 @@ StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=debros-rqlite-%i
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=yes
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
ReadWritePaths=/home/debros/.orama/data/namespaces
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
MemoryMax=2G
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user