network/pkg/gateway/instance_spawner.go

470 lines
13 KiB
Go

package gateway
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/tlsutil"
"go.uber.org/zap"
"gopkg.in/yaml.v3"
)
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
type InstanceNodeStatus string
const (
InstanceStatusPending InstanceNodeStatus = "pending"
InstanceStatusStarting InstanceNodeStatus = "starting"
InstanceStatusRunning InstanceNodeStatus = "running"
InstanceStatusStopped InstanceNodeStatus = "stopped"
InstanceStatusFailed InstanceNodeStatus = "failed"
)
// InstanceError represents an error during instance operations (local type to avoid import cycle)
type InstanceError struct {
Message string
Cause error
}
func (e *InstanceError) Error() string {
if e.Cause != nil {
return e.Message + ": " + e.Cause.Error()
}
return e.Message
}
func (e *InstanceError) Unwrap() error {
return e.Cause
}
// InstanceSpawner manages multiple Gateway instances for namespace clusters.
// Each namespace gets its own gateway instances that connect to its dedicated RQLite and Olric clusters.
type InstanceSpawner struct {
logger *zap.Logger
baseDir string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
instances map[string]*GatewayInstance
mu sync.RWMutex
}
// GatewayInstance represents a running Gateway instance for a namespace
type GatewayInstance struct {
Namespace string
NodeID string
HTTPPort int
BaseDomain string
RQLiteDSN string // Connection to namespace RQLite
OlricServers []string // Connection to namespace Olric
ConfigPath string
PID int
Status InstanceNodeStatus
StartedAt time.Time
LastHealthCheck time.Time
cmd *exec.Cmd
logger *zap.Logger
}
// InstanceConfig holds configuration for spawning a Gateway instance
type InstanceConfig struct {
Namespace string // Namespace name (e.g., "alice")
NodeID string // Physical node ID
HTTPPort int // HTTP API port
BaseDomain string // Base domain (e.g., "devnet-orama.network")
RQLiteDSN string // RQLite connection DSN (e.g., "http://localhost:10000")
OlricServers []string // Olric server addresses
NodePeerID string // Physical node's peer ID for home node management
DataDir string // Data directory for deployments, SQLite, etc.
}
// GatewayYAMLConfig represents the gateway YAML configuration structure
type GatewayYAMLConfig struct {
ListenAddr string `yaml:"listen_addr"`
ClientNamespace string `yaml:"client_namespace"`
RQLiteDSN string `yaml:"rqlite_dsn"`
OlricServers []string `yaml:"olric_servers"`
BaseDomain string `yaml:"base_domain"`
NodePeerID string `yaml:"node_peer_id"`
DataDir string `yaml:"data_dir"`
}
// NewInstanceSpawner creates a new Gateway instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
return &InstanceSpawner{
logger: logger.With(zap.String("component", "gateway-instance-spawner")),
baseDir: baseDir,
instances: make(map[string]*GatewayInstance),
}
}
// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(ns, nodeID string) string {
return fmt.Sprintf("%s:%s", ns, nodeID)
}
// SpawnInstance starts a new Gateway instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*GatewayInstance, error) {
key := instanceKey(cfg.Namespace, cfg.NodeID)
is.mu.Lock()
if existing, ok := is.instances[key]; ok {
is.mu.Unlock()
// Instance already exists, return it if running
if existing.Status == InstanceStatusRunning {
return existing, nil
}
// Otherwise, remove it and start fresh
is.mu.Lock()
delete(is.instances, key)
}
is.mu.Unlock()
// Create config and logs directories
configDir := filepath.Join(is.baseDir, cfg.Namespace, "configs")
logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
dataDir := filepath.Join(is.baseDir, cfg.Namespace, "data")
for _, dir := range []string{configDir, logsDir, dataDir} {
if err := os.MkdirAll(dir, 0755); err != nil {
return nil, &InstanceError{
Message: fmt.Sprintf("failed to create directory %s", dir),
Cause: err,
}
}
}
// Generate config file
configPath := filepath.Join(configDir, fmt.Sprintf("gateway-%s.yaml", cfg.NodeID))
if err := is.generateConfig(configPath, cfg, dataDir); err != nil {
return nil, err
}
instance := &GatewayInstance{
Namespace: cfg.Namespace,
NodeID: cfg.NodeID,
HTTPPort: cfg.HTTPPort,
BaseDomain: cfg.BaseDomain,
RQLiteDSN: cfg.RQLiteDSN,
OlricServers: cfg.OlricServers,
ConfigPath: configPath,
Status: InstanceStatusStarting,
logger: is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
}
instance.logger.Info("Starting Gateway instance",
zap.Int("http_port", cfg.HTTPPort),
zap.String("rqlite_dsn", cfg.RQLiteDSN),
zap.Strings("olric_servers", cfg.OlricServers),
)
// Find the gateway binary (should be in same directory as the current process or PATH)
gatewayBinary := "gateway"
// Create command
cmd := exec.CommandContext(ctx, gatewayBinary, "--config", configPath)
instance.cmd = cmd
// Setup logging
logPath := filepath.Join(logsDir, fmt.Sprintf("gateway-%s.log", cfg.NodeID))
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return nil, &InstanceError{
Message: "failed to open log file",
Cause: err,
}
}
cmd.Stdout = logFile
cmd.Stderr = logFile
// Start the process
if err := cmd.Start(); err != nil {
logFile.Close()
return nil, &InstanceError{
Message: "failed to start Gateway process",
Cause: err,
}
}
logFile.Close()
instance.PID = cmd.Process.Pid
instance.StartedAt = time.Now()
// Store instance
is.mu.Lock()
is.instances[key] = instance
is.mu.Unlock()
// Wait for instance to be ready
if err := is.waitForInstanceReady(ctx, instance); err != nil {
// Kill the process on failure
if cmd.Process != nil {
_ = cmd.Process.Kill()
}
is.mu.Lock()
delete(is.instances, key)
is.mu.Unlock()
return nil, &InstanceError{
Message: "Gateway instance did not become ready",
Cause: err,
}
}
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
instance.logger.Info("Gateway instance started successfully",
zap.Int("pid", instance.PID),
)
// Start background process monitor
go is.monitorInstance(instance)
return instance, nil
}
// generateConfig generates the Gateway YAML configuration file
func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig, dataDir string) error {
gatewayCfg := GatewayYAMLConfig{
ListenAddr: fmt.Sprintf(":%d", cfg.HTTPPort),
ClientNamespace: cfg.Namespace,
RQLiteDSN: cfg.RQLiteDSN,
OlricServers: cfg.OlricServers,
BaseDomain: cfg.BaseDomain,
NodePeerID: cfg.NodePeerID,
DataDir: dataDir,
}
data, err := yaml.Marshal(gatewayCfg)
if err != nil {
return &InstanceError{
Message: "failed to marshal Gateway config",
Cause: err,
}
}
if err := os.WriteFile(configPath, data, 0644); err != nil {
return &InstanceError{
Message: "failed to write Gateway config",
Cause: err,
}
}
return nil
}
// StopInstance stops a Gateway instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, ns, nodeID string) error {
key := instanceKey(ns, nodeID)
is.mu.Lock()
instance, ok := is.instances[key]
if !ok {
is.mu.Unlock()
return nil // Already stopped
}
delete(is.instances, key)
is.mu.Unlock()
if instance.cmd != nil && instance.cmd.Process != nil {
instance.logger.Info("Stopping Gateway instance", zap.Int("pid", instance.PID))
// Send SIGTERM for graceful shutdown
if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
// If SIGTERM fails, kill it
_ = instance.cmd.Process.Kill()
}
// Wait for process to exit with timeout
done := make(chan error, 1)
go func() {
done <- instance.cmd.Wait()
}()
select {
case <-done:
instance.logger.Info("Gateway instance stopped gracefully")
case <-time.After(10 * time.Second):
instance.logger.Warn("Gateway instance did not stop gracefully, killing")
_ = instance.cmd.Process.Kill()
case <-ctx.Done():
_ = instance.cmd.Process.Kill()
return ctx.Err()
}
}
instance.Status = InstanceStatusStopped
return nil
}
// StopAllInstances stops all Gateway instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
is.mu.RLock()
var keys []string
for key, inst := range is.instances {
if inst.Namespace == ns {
keys = append(keys, key)
}
}
is.mu.RUnlock()
var lastErr error
for _, key := range keys {
parts := strings.SplitN(key, ":", 2)
if len(parts) == 2 {
if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
lastErr = err
}
}
}
return lastErr
}
// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(ns, nodeID string) (*GatewayInstance, bool) {
is.mu.RLock()
defer is.mu.RUnlock()
instance, ok := is.instances[instanceKey(ns, nodeID)]
return instance, ok
}
// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*GatewayInstance {
is.mu.RLock()
defer is.mu.RUnlock()
var instances []*GatewayInstance
for _, inst := range is.instances {
if inst.Namespace == ns {
instances = append(instances, inst)
}
}
return instances
}
// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, ns, nodeID string) (bool, error) {
instance, ok := is.GetInstance(ns, nodeID)
if !ok {
return false, &InstanceError{Message: "instance not found"}
}
healthy, err := instance.IsHealthy(ctx)
if healthy {
is.mu.Lock()
instance.LastHealthCheck = time.Now()
is.mu.Unlock()
}
return healthy, err
}
// waitForInstanceReady waits for the Gateway instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *GatewayInstance) error {
client := tlsutil.NewHTTPClient(2 * time.Second)
// Gateway health check endpoint
url := fmt.Sprintf("http://localhost:%d/v1/health", instance.HTTPPort)
maxAttempts := 120 // 2 minutes
for i := 0; i < maxAttempts; i++ {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
}
resp, err := client.Get(url)
if err != nil {
continue
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
instance.logger.Debug("Gateway instance ready",
zap.Int("attempts", i+1),
)
return nil
}
}
return fmt.Errorf("Gateway did not become ready within timeout")
}
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *GatewayInstance) {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for range ticker.C {
is.mu.RLock()
key := instanceKey(instance.Namespace, instance.NodeID)
_, exists := is.instances[key]
is.mu.RUnlock()
if !exists {
// Instance was removed
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
healthy, _ := instance.IsHealthy(ctx)
cancel()
is.mu.Lock()
if healthy {
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
} else {
instance.Status = InstanceStatusFailed
instance.logger.Warn("Gateway instance health check failed")
}
is.mu.Unlock()
// Check if process is still running
if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
is.mu.Lock()
instance.Status = InstanceStatusStopped
is.mu.Unlock()
instance.logger.Warn("Gateway instance process exited unexpectedly")
return
}
}
}
// IsHealthy checks if the Gateway instance is healthy
func (gi *GatewayInstance) IsHealthy(ctx context.Context) (bool, error) {
url := fmt.Sprintf("http://localhost:%d/v1/health", gi.HTTPPort)
client := tlsutil.NewHTTPClient(5 * time.Second)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return false, err
}
resp, err := client.Do(req)
if err != nil {
return false, err
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK, nil
}
// DSN returns the local connection address for this Gateway instance
func (gi *GatewayInstance) DSN() string {
return fmt.Sprintf("http://localhost:%d", gi.HTTPPort)
}
// ExternalURL returns the external URL for accessing this namespace's gateway
func (gi *GatewayInstance) ExternalURL() string {
return fmt.Sprintf("https://ns-%s.%s", gi.Namespace, gi.BaseDomain)
}