Mirror of https://github.com/DeBrosOfficial/network.git (synced 2026-01-30 03:43:04 +00:00)

Commit edd9c1f3dc (parent 468ca06398): namespaces on gateway, load balancer and rqlite and olric namespaces
.gitignore (vendored): 4 lines changed
@ -92,4 +92,6 @@ orama-cli-linux
rnd/
keys_backup/
vps.txt
e2e/namespace_cluster_test.go (new file): 391 lines
@ -0,0 +1,391 @@
//go:build e2e

package e2e

import (
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "path/filepath"
    "strings"
    "testing"
    "time"

    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

// TestNamespaceCluster_Provisioning tests that creating a new namespace
// triggers cluster provisioning with 202 Accepted response
func TestNamespaceCluster_Provisioning(t *testing.T) {
    if !IsProductionMode() {
        t.Skip("Namespace cluster provisioning only applies in production mode")
    }

    // This test requires a completely new namespace to trigger provisioning
    newNamespace := fmt.Sprintf("test-ns-%d", time.Now().UnixNano())

    env, err := LoadTestEnvWithNamespace(newNamespace)
    require.NoError(t, err, "Should create test environment")

    t.Run("New namespace triggers provisioning", func(t *testing.T) {
        // If we got here with an API key, provisioning either completed or was not required
        // The LoadTestEnvWithNamespace function handles the provisioning flow
        require.NotEmpty(t, env.APIKey, "Should have received API key after provisioning")
        t.Logf("Namespace %s provisioned successfully", newNamespace)
    })

    t.Run("Namespace gateway is accessible", func(t *testing.T) {
        // Try to access the namespace gateway
        // The URL should be ns-{namespace}.{baseDomain}
        cfg, _ := LoadE2EConfig()
        if cfg.BaseDomain == "" {
            cfg.BaseDomain = "devnet-orama.network"
        }

        nsGatewayURL := fmt.Sprintf("https://ns-%s.%s", newNamespace, cfg.BaseDomain)

        req, _ := http.NewRequest("GET", nsGatewayURL+"/v1/health", nil)
        req.Header.Set("Authorization", "Bearer "+env.APIKey)

        resp, err := env.HTTPClient.Do(req)
        if err != nil {
            t.Logf("Note: Namespace gateway not accessible (expected in local mode): %v", err)
            t.Skip("Namespace gateway endpoint not available")
        }
        defer resp.Body.Close()

        assert.Equal(t, http.StatusOK, resp.StatusCode, "Namespace gateway should be healthy")
        t.Logf("Namespace gateway %s is accessible", nsGatewayURL)
    })
}

// TestNamespaceCluster_StatusPolling tests the /v1/namespace/status endpoint
func TestNamespaceCluster_StatusPolling(t *testing.T) {
    env, err := LoadTestEnv()
    require.NoError(t, err, "Should load test environment")

    t.Run("Status endpoint returns valid response", func(t *testing.T) {
        // Test with a non-existent cluster ID (should return 404)
        req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/namespace/status?id=non-existent-id", nil)

        resp, err := env.HTTPClient.Do(req)
        require.NoError(t, err, "Should execute request")
        defer resp.Body.Close()

        // Should return 404 for non-existent cluster
        assert.Equal(t, http.StatusNotFound, resp.StatusCode, "Should return 404 for non-existent cluster")
    })
}

// TestNamespaceCluster_CrossGatewayAccess tests that API keys from one namespace
// cannot access another namespace's dedicated gateway
func TestNamespaceCluster_CrossGatewayAccess(t *testing.T) {
    if !IsProductionMode() {
        t.Skip("Cross-gateway access control only applies in production mode")
    }

    // Create two namespaces
    nsA := fmt.Sprintf("ns-a-%d", time.Now().Unix())
    nsB := fmt.Sprintf("ns-b-%d", time.Now().Unix())

    envA, err := LoadTestEnvWithNamespace(nsA)
    require.NoError(t, err, "Should create test environment for namespace A")

    envB, err := LoadTestEnvWithNamespace(nsB)
    require.NoError(t, err, "Should create test environment for namespace B")

    cfg, _ := LoadE2EConfig()
    if cfg.BaseDomain == "" {
        cfg.BaseDomain = "devnet-orama.network"
    }

    t.Run("Namespace A key cannot access Namespace B gateway", func(t *testing.T) {
        // Try to use namespace A's key on namespace B's gateway
        nsBGatewayURL := fmt.Sprintf("https://ns-%s.%s", nsB, cfg.BaseDomain)

        req, _ := http.NewRequest("GET", nsBGatewayURL+"/v1/deployments/list", nil)
        req.Header.Set("Authorization", "Bearer "+envA.APIKey) // Using A's key

        resp, err := envA.HTTPClient.Do(req)
        if err != nil {
            t.Logf("Note: Gateway not accessible: %v", err)
            t.Skip("Namespace gateway endpoint not available")
        }
        defer resp.Body.Close()

        assert.Equal(t, http.StatusForbidden, resp.StatusCode,
            "Should deny namespace A's key on namespace B's gateway")
        t.Logf("Cross-namespace access correctly denied (status: %d)", resp.StatusCode)
    })

    t.Run("Namespace B key works on Namespace B gateway", func(t *testing.T) {
        nsBGatewayURL := fmt.Sprintf("https://ns-%s.%s", nsB, cfg.BaseDomain)

        req, _ := http.NewRequest("GET", nsBGatewayURL+"/v1/deployments/list", nil)
        req.Header.Set("Authorization", "Bearer "+envB.APIKey) // Using B's key

        resp, err := envB.HTTPClient.Do(req)
        if err != nil {
            t.Logf("Note: Gateway not accessible: %v", err)
            t.Skip("Namespace gateway endpoint not available")
        }
        defer resp.Body.Close()

        assert.Equal(t, http.StatusOK, resp.StatusCode,
            "Should allow namespace B's key on namespace B's gateway")
        t.Logf("Same-namespace access correctly allowed")
    })
}

// TestNamespaceCluster_DefaultNamespaceAccessible tests that the default namespace
// is accessible by any valid API key
func TestNamespaceCluster_DefaultNamespaceAccessible(t *testing.T) {
    // Create a non-default namespace
    customNS := fmt.Sprintf("custom-%d", time.Now().Unix())
    env, err := LoadTestEnvWithNamespace(customNS)
    require.NoError(t, err, "Should create test environment")

    t.Run("Custom namespace key can access default gateway endpoints", func(t *testing.T) {
        // The default gateway should accept keys from any namespace
        req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/health", nil)
        req.Header.Set("Authorization", "Bearer "+env.APIKey)

        resp, err := env.HTTPClient.Do(req)
        require.NoError(t, err, "Should execute request")
        defer resp.Body.Close()

        assert.Equal(t, http.StatusOK, resp.StatusCode,
            "Default gateway should accept any valid API key")
    })
}

// TestDeployment_RandomSubdomain tests that deployments get random subdomain suffix
func TestDeployment_RandomSubdomain(t *testing.T) {
    env, err := LoadTestEnv()
    require.NoError(t, err, "Should load test environment")

    tarballPath := filepath.Join("../testdata/tarballs/react-vite.tar.gz")

    // Create a deployment
    deploymentName := "subdomain-test"
    deploymentID := CreateTestDeployment(t, env, deploymentName, tarballPath)
    defer func() {
        if !env.SkipCleanup {
            DeleteDeployment(t, env, deploymentID)
        }
    }()

    t.Run("Deployment URL contains random suffix", func(t *testing.T) {
        // Get deployment details
        req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/deployments/get?id="+deploymentID, nil)
        req.Header.Set("Authorization", "Bearer "+env.APIKey)

        resp, err := env.HTTPClient.Do(req)
        require.NoError(t, err, "Should execute request")
        defer resp.Body.Close()

        require.Equal(t, http.StatusOK, resp.StatusCode, "Should get deployment")

        var result map[string]interface{}
        bodyBytes, _ := io.ReadAll(resp.Body)
        require.NoError(t, json.Unmarshal(bodyBytes, &result), "Should decode JSON")

        deployment, ok := result["deployment"].(map[string]interface{})
        if !ok {
            deployment = result
        }

        // Check subdomain field
        subdomain, _ := deployment["subdomain"].(string)
        if subdomain != "" {
            // Subdomain should follow format: {name}-{random}
            // e.g., "subdomain-test-f3o4if"
            assert.True(t, strings.HasPrefix(subdomain, deploymentName+"-"),
                "Subdomain should start with deployment name followed by dash")

            suffix := strings.TrimPrefix(subdomain, deploymentName+"-")
            assert.Equal(t, 6, len(suffix), "Random suffix should be 6 characters")

            t.Logf("Deployment subdomain: %s (suffix: %s)", subdomain, suffix)
        } else {
            t.Logf("Note: Subdomain field not set (may be using legacy format)")
        }

        // Check URLs
        urls, ok := deployment["urls"].([]interface{})
        if ok && len(urls) > 0 {
            url := urls[0].(string)
            t.Logf("Deployment URL: %s", url)

            // URL should contain the subdomain with random suffix
            if subdomain != "" {
                assert.Contains(t, url, subdomain, "URL should contain the subdomain")
            }
        }
    })
}

// TestDeployment_SubdomainUniqueness tests that two deployments with the same name
// get different subdomains
func TestDeployment_SubdomainUniqueness(t *testing.T) {
    envA, err := LoadTestEnvWithNamespace("ns-unique-a-" + fmt.Sprintf("%d", time.Now().Unix()))
    require.NoError(t, err, "Should create test environment A")

    envB, err := LoadTestEnvWithNamespace("ns-unique-b-" + fmt.Sprintf("%d", time.Now().Unix()))
    require.NoError(t, err, "Should create test environment B")

    tarballPath := filepath.Join("../testdata/tarballs/react-vite.tar.gz")
    deploymentName := "same-name-app"

    // Create deployment in namespace A
    deploymentIDA := CreateTestDeployment(t, envA, deploymentName, tarballPath)
    defer func() {
        if !envA.SkipCleanup {
            DeleteDeployment(t, envA, deploymentIDA)
        }
    }()

    // Create deployment with same name in namespace B
    deploymentIDB := CreateTestDeployment(t, envB, deploymentName, tarballPath)
    defer func() {
        if !envB.SkipCleanup {
            DeleteDeployment(t, envB, deploymentIDB)
        }
    }()

    t.Run("Same name deployments have different subdomains", func(t *testing.T) {
        // Get deployment A details
        reqA, _ := http.NewRequest("GET", envA.GatewayURL+"/v1/deployments/get?id="+deploymentIDA, nil)
        reqA.Header.Set("Authorization", "Bearer "+envA.APIKey)
        respA, _ := envA.HTTPClient.Do(reqA)
        defer respA.Body.Close()

        var resultA map[string]interface{}
        bodyBytesA, _ := io.ReadAll(respA.Body)
        json.Unmarshal(bodyBytesA, &resultA)

        deploymentA, ok := resultA["deployment"].(map[string]interface{})
        if !ok {
            deploymentA = resultA
        }
        subdomainA, _ := deploymentA["subdomain"].(string)

        // Get deployment B details
        reqB, _ := http.NewRequest("GET", envB.GatewayURL+"/v1/deployments/get?id="+deploymentIDB, nil)
        reqB.Header.Set("Authorization", "Bearer "+envB.APIKey)
        respB, _ := envB.HTTPClient.Do(reqB)
        defer respB.Body.Close()

        var resultB map[string]interface{}
        bodyBytesB, _ := io.ReadAll(respB.Body)
        json.Unmarshal(bodyBytesB, &resultB)

        deploymentB, ok := resultB["deployment"].(map[string]interface{})
        if !ok {
            deploymentB = resultB
        }
        subdomainB, _ := deploymentB["subdomain"].(string)

        // If subdomains are set, they should be different
        if subdomainA != "" && subdomainB != "" {
            assert.NotEqual(t, subdomainA, subdomainB,
                "Same-name deployments in different namespaces should have different subdomains")

            t.Logf("Namespace A subdomain: %s", subdomainA)
            t.Logf("Namespace B subdomain: %s", subdomainB)
        } else {
            t.Logf("Note: Subdomains not set (may be using legacy format)")
        }
    })
}

// TestNamespaceCluster_DNSFormat tests the DNS naming convention for namespaces
func TestNamespaceCluster_DNSFormat(t *testing.T) {
    cfg, err := LoadE2EConfig()
    if err != nil {
        cfg = DefaultConfig()
    }

    if cfg.BaseDomain == "" {
        cfg.BaseDomain = "devnet-orama.network"
    }

    t.Run("Namespace gateway DNS follows ns-{name}.{baseDomain} format", func(t *testing.T) {
        namespace := "my-test-namespace"
        expectedDomain := fmt.Sprintf("ns-%s.%s", namespace, cfg.BaseDomain)

        t.Logf("Expected namespace gateway domain: %s", expectedDomain)

        // Verify format
        assert.True(t, strings.HasPrefix(expectedDomain, "ns-"),
            "Namespace gateway domain should start with 'ns-'")
        assert.True(t, strings.HasSuffix(expectedDomain, cfg.BaseDomain),
            "Namespace gateway domain should end with base domain")
    })

    t.Run("Deployment DNS follows {name}-{random}.{baseDomain} format", func(t *testing.T) {
        deploymentName := "my-app"
        randomSuffix := "f3o4if"
        expectedDomain := fmt.Sprintf("%s-%s.%s", deploymentName, randomSuffix, cfg.BaseDomain)

        t.Logf("Expected deployment domain: %s", expectedDomain)

        // Verify format
        assert.Contains(t, expectedDomain, deploymentName,
            "Deployment domain should contain the deployment name")
        assert.True(t, strings.HasSuffix(expectedDomain, cfg.BaseDomain),
            "Deployment domain should end with base domain")
    })
}

// TestNamespaceCluster_PortAllocation tests the port allocation constraints
func TestNamespaceCluster_PortAllocation(t *testing.T) {
    t.Run("Port range constants are correct", func(t *testing.T) {
        // These constants are defined in pkg/namespace/types.go
        const (
            portRangeStart       = 10000
            portRangeEnd         = 10099
            portsPerNamespace    = 5
            maxNamespacesPerNode = 20
        )

        // Verify range calculation
        totalPorts := portRangeEnd - portRangeStart + 1
        assert.Equal(t, 100, totalPorts, "Port range should be 100 ports")

        expectedMax := totalPorts / portsPerNamespace
        assert.Equal(t, maxNamespacesPerNode, expectedMax,
            "Max namespaces per node should be total ports / ports per namespace")

        t.Logf("Port range: %d-%d (%d ports total)", portRangeStart, portRangeEnd, totalPorts)
        t.Logf("Ports per namespace: %d", portsPerNamespace)
        t.Logf("Max namespaces per node: %d", maxNamespacesPerNode)
    })

    t.Run("Port assignments within a block are sequential", func(t *testing.T) {
        portStart := 10000

        rqliteHTTP := portStart + 0
        rqliteRaft := portStart + 1
        olricHTTP := portStart + 2
        olricMemberlist := portStart + 3
        gatewayHTTP := portStart + 4

        // All ports should be unique
        ports := []int{rqliteHTTP, rqliteRaft, olricHTTP, olricMemberlist, gatewayHTTP}
        seen := make(map[int]bool)
        for _, port := range ports {
            assert.False(t, seen[port], "Ports should be unique within a block")
            seen[port] = true
        }

        t.Logf("Port assignments for block starting at %d:", portStart)
        t.Logf("  RQLite HTTP: %d", rqliteHTTP)
        t.Logf("  RQLite Raft: %d", rqliteRaft)
        t.Logf("  Olric HTTP: %d", olricHTTP)
        t.Logf("  Olric Memberlist: %d", olricMemberlist)
        t.Logf("  Gateway HTTP: %d", gatewayHTTP)
    })
}
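The port-allocation test above encodes the layout the rest of the commit assumes: each namespace cluster instance on a node gets a block of 5 consecutive ports out of the reserved 10000-10099 range, in the order RQLite HTTP, RQLite Raft, Olric HTTP, Olric memberlist, Gateway HTTP, which caps a node at 20 namespace clusters. A minimal standalone sketch of that arithmetic (the helper and type names here are illustrative, not part of this commit):

package main

import "fmt"

const (
    portRangeStart    = 10000
    portsPerNamespace = 5
)

// PortBlock mirrors the per-namespace port layout used by the tests and migration.
type PortBlock struct {
    RQLiteHTTP, RQLiteRaft, OlricHTTP, OlricMemberlist, GatewayHTTP int
}

// blockForSlot computes the port block for the Nth namespace slot on a node (slots 0-19).
func blockForSlot(slot int) PortBlock {
    start := portRangeStart + slot*portsPerNamespace
    return PortBlock{
        RQLiteHTTP:      start + 0,
        RQLiteRaft:      start + 1,
        OlricHTTP:       start + 2,
        OlricMemberlist: start + 3,
        GatewayHTTP:     start + 4,
    }
}

func main() {
    // Slot 3 occupies ports 10015-10019.
    fmt.Printf("%+v\n", blockForSlot(3))
}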
migrations/010_namespace_clusters.sql (new file): 190 lines
@ -0,0 +1,190 @@
-- Migration 010: Namespace Clusters for Physical Isolation
-- Creates tables to manage per-namespace RQLite and Olric clusters
-- Each namespace gets its own 3-node cluster for complete isolation

BEGIN;

-- Extend namespaces table with cluster status tracking
-- Note: SQLite doesn't support ADD COLUMN IF NOT EXISTS, so we handle this carefully
-- These columns track the provisioning state of the namespace's dedicated cluster

-- First check if columns exist, if not add them
-- cluster_status: 'none', 'provisioning', 'ready', 'degraded', 'failed', 'deprovisioning'

-- Create a new namespaces table with additional columns if needed
CREATE TABLE IF NOT EXISTS namespaces_new (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  name TEXT NOT NULL UNIQUE,
  created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  cluster_status TEXT DEFAULT 'none',
  cluster_created_at TIMESTAMP,
  cluster_ready_at TIMESTAMP
);

-- Copy data from old table if it exists and new columns don't
INSERT OR IGNORE INTO namespaces_new (id, name, created_at, cluster_status)
SELECT id, name, created_at, 'none' FROM namespaces WHERE NOT EXISTS (
  SELECT 1 FROM pragma_table_info('namespaces') WHERE name = 'cluster_status'
);

-- If the column already exists, this migration was partially applied - skip the table swap
-- We'll use a different approach: just ensure the new tables exist

-- Namespace clusters registry
-- One record per namespace that has a dedicated cluster
CREATE TABLE IF NOT EXISTS namespace_clusters (
  id TEXT PRIMARY KEY,                           -- UUID
  namespace_id INTEGER NOT NULL UNIQUE,          -- FK to namespaces
  namespace_name TEXT NOT NULL,                  -- Cached for easier lookups
  status TEXT NOT NULL DEFAULT 'provisioning',   -- provisioning, ready, degraded, failed, deprovisioning

  -- Cluster configuration
  rqlite_node_count INTEGER NOT NULL DEFAULT 3,
  olric_node_count INTEGER NOT NULL DEFAULT 3,
  gateway_node_count INTEGER NOT NULL DEFAULT 3,

  -- Provisioning metadata
  provisioned_by TEXT NOT NULL,                  -- Wallet address that triggered provisioning
  provisioned_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  ready_at TIMESTAMP,
  last_health_check TIMESTAMP,

  -- Error tracking
  error_message TEXT,
  retry_count INTEGER DEFAULT 0,

  FOREIGN KEY (namespace_id) REFERENCES namespaces(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_namespace_clusters_status ON namespace_clusters(status);
CREATE INDEX IF NOT EXISTS idx_namespace_clusters_namespace ON namespace_clusters(namespace_id);
CREATE INDEX IF NOT EXISTS idx_namespace_clusters_name ON namespace_clusters(namespace_name);

-- Namespace cluster nodes
-- Tracks which physical nodes host services for each namespace cluster
CREATE TABLE IF NOT EXISTS namespace_cluster_nodes (
  id TEXT PRIMARY KEY,                           -- UUID
  namespace_cluster_id TEXT NOT NULL,            -- FK to namespace_clusters
  node_id TEXT NOT NULL,                         -- FK to dns_nodes (physical node)

  -- Role in the cluster
  -- Each node can have multiple roles (rqlite + olric + gateway)
  role TEXT NOT NULL,                            -- 'rqlite_leader', 'rqlite_follower', 'olric', 'gateway'

  -- Service ports (allocated from reserved range 10000-10099)
  rqlite_http_port INTEGER,                      -- Port for RQLite HTTP API
  rqlite_raft_port INTEGER,                      -- Port for RQLite Raft consensus
  olric_http_port INTEGER,                       -- Port for Olric HTTP API
  olric_memberlist_port INTEGER,                 -- Port for Olric memberlist gossip
  gateway_http_port INTEGER,                     -- Port for Gateway HTTP

  -- Service status
  status TEXT NOT NULL DEFAULT 'pending',        -- pending, starting, running, stopped, failed
  process_pid INTEGER,                           -- PID of running process (for local management)
  last_heartbeat TIMESTAMP,
  error_message TEXT,

  -- Join addresses for cluster formation
  rqlite_join_address TEXT,                      -- Address to join RQLite cluster
  olric_peers TEXT,                              -- JSON array of Olric peer addresses

  -- Metadata
  created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

  UNIQUE(namespace_cluster_id, node_id, role),
  FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_cluster_nodes_cluster ON namespace_cluster_nodes(namespace_cluster_id);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_node ON namespace_cluster_nodes(node_id);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON namespace_cluster_nodes(status);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_role ON namespace_cluster_nodes(role);

-- Namespace port allocations
-- Manages the reserved port range (10000-10099) for namespace services
-- Each namespace instance on a node gets a block of 5 consecutive ports
CREATE TABLE IF NOT EXISTS namespace_port_allocations (
  id TEXT PRIMARY KEY,                           -- UUID
  node_id TEXT NOT NULL,                         -- Physical node ID
  namespace_cluster_id TEXT NOT NULL,            -- Namespace cluster this allocation belongs to

  -- Port block (5 consecutive ports)
  port_start INTEGER NOT NULL,                   -- Start of port block (e.g., 10000)
  port_end INTEGER NOT NULL,                     -- End of port block (e.g., 10004)

  -- Individual port assignments within the block
  rqlite_http_port INTEGER NOT NULL,             -- port_start + 0
  rqlite_raft_port INTEGER NOT NULL,             -- port_start + 1
  olric_http_port INTEGER NOT NULL,              -- port_start + 2
  olric_memberlist_port INTEGER NOT NULL,        -- port_start + 3
  gateway_http_port INTEGER NOT NULL,            -- port_start + 4

  allocated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

  -- Prevent overlapping allocations on same node
  UNIQUE(node_id, port_start),
  -- One allocation per namespace per node
  UNIQUE(namespace_cluster_id, node_id),
  FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_ns_port_alloc_node ON namespace_port_allocations(node_id);
CREATE INDEX IF NOT EXISTS idx_ns_port_alloc_cluster ON namespace_port_allocations(namespace_cluster_id);

-- Namespace cluster events
-- Audit log for cluster provisioning and lifecycle events
CREATE TABLE IF NOT EXISTS namespace_cluster_events (
  id TEXT PRIMARY KEY,                           -- UUID
  namespace_cluster_id TEXT NOT NULL,
  event_type TEXT NOT NULL,                      -- Event types listed below
  node_id TEXT,                                  -- Optional: specific node this event relates to
  message TEXT,
  metadata TEXT,                                 -- JSON for additional event data
  created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

  FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);

-- Event types:
-- 'provisioning_started'    - Cluster provisioning began
-- 'nodes_selected'          - 3 nodes were selected for the cluster
-- 'ports_allocated'         - Ports allocated on a node
-- 'rqlite_started'          - RQLite instance started on a node
-- 'rqlite_joined'           - RQLite instance joined the cluster
-- 'rqlite_leader_elected'   - RQLite leader election completed
-- 'olric_started'           - Olric instance started on a node
-- 'olric_joined'            - Olric instance joined memberlist
-- 'gateway_started'         - Gateway instance started on a node
-- 'dns_created'             - DNS records created for namespace
-- 'cluster_ready'           - All services ready, cluster is operational
-- 'cluster_degraded'        - One or more nodes are unhealthy
-- 'cluster_failed'          - Cluster failed to provision or operate
-- 'node_failed'             - Specific node became unhealthy
-- 'node_recovered'          - Node recovered from failure
-- 'deprovisioning_started'  - Cluster deprovisioning began
-- 'deprovisioned'           - Cluster fully deprovisioned

CREATE INDEX IF NOT EXISTS idx_cluster_events_cluster ON namespace_cluster_events(namespace_cluster_id, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON namespace_cluster_events(event_type);

-- Global deployment registry
-- Prevents duplicate deployment subdomains across all namespaces
-- Since deployments now use {name}-{random}.{domain}, we track used subdomains globally
CREATE TABLE IF NOT EXISTS global_deployment_subdomains (
  subdomain TEXT PRIMARY KEY,                    -- Full subdomain (e.g., 'myapp-f3o4if')
  namespace TEXT NOT NULL,                       -- Owner namespace
  deployment_id TEXT NOT NULL,                   -- FK to deployments (in namespace cluster)
  created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

  -- No FK to deployments since deployments are in namespace-specific clusters
  UNIQUE(subdomain)
);

CREATE INDEX IF NOT EXISTS idx_global_subdomains_namespace ON global_deployment_subdomains(namespace);
CREATE INDEX IF NOT EXISTS idx_global_subdomains_deployment ON global_deployment_subdomains(deployment_id);

-- Mark migration as applied
INSERT OR IGNORE INTO schema_migrations(version) VALUES (10);

COMMIT;
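To see how these tables fit together, here is a hedged sketch of a read against the new schema. It mirrors the `db.Query(ctx, &rows, query, args...)` call shape used by the deployment service later in this commit; the local interface, struct, and function names are illustrative and not part of the commit:

// queryer is the minimal slice of the rqlite client used here, matching the Query call shape seen in the deployment service.
type queryer interface {
    Query(ctx context.Context, dest interface{}, query string, args ...interface{}) error
}

// ClusterNodePorts is an illustrative row type joining a cluster to its per-node port allocations.
type ClusterNodePorts struct {
    NodeID      string `db:"node_id"`
    PortStart   int    `db:"port_start"`
    GatewayHTTP int    `db:"gateway_http_port"`
}

// listClusterPorts returns, per node, the port block allocated to a namespace's cluster.
func listClusterPorts(ctx context.Context, db queryer, namespaceName string) ([]ClusterNodePorts, error) {
    var rows []ClusterNodePorts
    query := `
        SELECT a.node_id, a.port_start, a.gateway_http_port
        FROM namespace_clusters c
        JOIN namespace_port_allocations a ON a.namespace_cluster_id = c.id
        WHERE c.namespace_name = ?`
    if err := db.Query(ctx, &rows, query, namespaceName); err != nil {
        return nil, err
    }
    return rows, nil
}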
@ -85,6 +85,7 @@ func PerformSimpleAuthentication(gatewayURL string) (*Credentials, error) {
}

// requestAPIKeyFromGateway calls the gateway's simple-key endpoint to generate an API key
// For non-default namespaces, this may trigger cluster provisioning and require polling
func requestAPIKeyFromGateway(gatewayURL, wallet, namespace string) (string, error) {
    reqBody := map[string]string{
        "wallet": wallet,
@ -109,6 +110,170 @@ func requestAPIKeyFromGateway(gatewayURL, wallet, namespace string) (string, err
    }
    defer resp.Body.Close()

    // Handle 202 Accepted - namespace cluster is being provisioned
    if resp.StatusCode == http.StatusAccepted {
        return handleProvisioningResponse(gatewayURL, client, resp, wallet, namespace)
    }

    if resp.StatusCode != http.StatusOK {
        body, _ := io.ReadAll(resp.Body)
        return "", fmt.Errorf("gateway returned status %d: %s", resp.StatusCode, string(body))
    }

    var respBody map[string]interface{}
    if err := json.NewDecoder(resp.Body).Decode(&respBody); err != nil {
        return "", fmt.Errorf("failed to decode response: %w", err)
    }

    apiKey, ok := respBody["api_key"].(string)
    if !ok || apiKey == "" {
        return "", fmt.Errorf("no api_key in response")
    }

    return apiKey, nil
}

// handleProvisioningResponse handles 202 Accepted responses when namespace cluster provisioning is needed
func handleProvisioningResponse(gatewayURL string, client *http.Client, resp *http.Response, wallet, namespace string) (string, error) {
    var provResp map[string]interface{}
    if err := json.NewDecoder(resp.Body).Decode(&provResp); err != nil {
        return "", fmt.Errorf("failed to decode provisioning response: %w", err)
    }

    status, _ := provResp["status"].(string)
    pollURL, _ := provResp["poll_url"].(string)
    clusterID, _ := provResp["cluster_id"].(string)
    message, _ := provResp["message"].(string)

    if status != "provisioning" {
        return "", fmt.Errorf("unexpected status: %s", status)
    }

    fmt.Printf("\n🏗️  Provisioning namespace cluster...\n")
    if message != "" {
        fmt.Printf("   %s\n", message)
    }
    if clusterID != "" {
        fmt.Printf("   Cluster ID: %s\n", clusterID)
    }
    fmt.Println()

    // Poll until cluster is ready
    if err := pollProvisioningStatus(gatewayURL, client, pollURL); err != nil {
        return "", err
    }

    // Cluster is ready, retry the API key request
    fmt.Println("\n✅ Namespace cluster ready!")
    fmt.Println("⏳ Retrieving API key...")

    return retryAPIKeyRequest(gatewayURL, client, wallet, namespace)
}

// pollProvisioningStatus polls the status endpoint until the cluster is ready
func pollProvisioningStatus(gatewayURL string, client *http.Client, pollURL string) error {
    // Build full poll URL if it's a relative path
    if strings.HasPrefix(pollURL, "/") {
        pollURL = gatewayURL + pollURL
    }

    maxAttempts := 120 // 10 minutes (5 seconds per poll)
    pollInterval := 5 * time.Second

    spinnerChars := []string{"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"}
    spinnerIdx := 0

    for i := 0; i < maxAttempts; i++ {
        // Show progress spinner
        fmt.Printf("\r%s Waiting for cluster... ", spinnerChars[spinnerIdx%len(spinnerChars)])
        spinnerIdx++

        resp, err := client.Get(pollURL)
        if err != nil {
            time.Sleep(pollInterval)
            continue
        }

        var statusResp map[string]interface{}
        if err := json.NewDecoder(resp.Body).Decode(&statusResp); err != nil {
            resp.Body.Close()
            time.Sleep(pollInterval)
            continue
        }
        resp.Body.Close()

        status, _ := statusResp["status"].(string)

        switch status {
        case "ready":
            fmt.Printf("\r✅ Cluster ready!              \n")
            return nil

        case "failed":
            errMsg, _ := statusResp["error"].(string)
            fmt.Printf("\r❌ Provisioning failed         \n")
            return fmt.Errorf("cluster provisioning failed: %s", errMsg)

        case "provisioning":
            // Show progress details
            rqliteReady, _ := statusResp["rqlite_ready"].(bool)
            olricReady, _ := statusResp["olric_ready"].(bool)
            gatewayReady, _ := statusResp["gateway_ready"].(bool)
            dnsReady, _ := statusResp["dns_ready"].(bool)

            progressStr := ""
            if rqliteReady {
                progressStr += "RQLite✓ "
            }
            if olricReady {
                progressStr += "Olric✓ "
            }
            if gatewayReady {
                progressStr += "Gateway✓ "
            }
            if dnsReady {
                progressStr += "DNS✓"
            }
            if progressStr != "" {
                fmt.Printf("\r%s Provisioning... [%s]", spinnerChars[spinnerIdx%len(spinnerChars)], progressStr)
            }

        default:
            // Unknown status, continue polling
        }

        time.Sleep(pollInterval)
    }

    fmt.Printf("\r⚠️  Timeout waiting for cluster   \n")
    return fmt.Errorf("timeout waiting for namespace cluster provisioning")
}

// retryAPIKeyRequest retries the API key request after cluster provisioning
func retryAPIKeyRequest(gatewayURL string, client *http.Client, wallet, namespace string) (string, error) {
    reqBody := map[string]string{
        "wallet":    wallet,
        "namespace": namespace,
    }

    payload, err := json.Marshal(reqBody)
    if err != nil {
        return "", fmt.Errorf("failed to marshal request: %w", err)
    }

    endpoint := gatewayURL + "/v1/auth/simple-key"

    resp, err := client.Post(endpoint, "application/json", bytes.NewReader(payload))
    if err != nil {
        return "", fmt.Errorf("failed to call gateway: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode == http.StatusAccepted {
        // Still provisioning? This shouldn't happen but handle gracefully
        return "", fmt.Errorf("cluster still provisioning, please try again")
    }

    if resp.StatusCode != http.StatusOK {
        body, _ := io.ReadAll(resp.Body)
        return "", fmt.Errorf("gateway returned status %d: %s", resp.StatusCode, string(body))
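From the CLI side, the whole 202/poll/retry dance above stays behind a single call: `PerformSimpleAuthentication` requests a key, and for a non-default namespace `requestAPIKeyFromGateway` blocks until the cluster is ready or provisioning times out. A minimal, hedged usage sketch (the gateway URL is illustrative, and the fields of `Credentials` are not shown in this diff, so only the call itself is demonstrated):

// Illustrative caller; assumes it lives where PerformSimpleAuthentication is accessible.
func authenticate() error {
    creds, err := PerformSimpleAuthentication("https://gateway.devnet-orama.network") // illustrative URL
    if err != nil {
        // A provisioning timeout or a failed cluster surfaces here as a plain error.
        return fmt.Errorf("authentication failed: %w", err)
    }
    _ = creds // API key and related data live on *Credentials (fields not shown in this diff)
    return nil
}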
@ -1,6 +1,7 @@
package install

import (
    "bufio"
    "fmt"
    "os"
    "path/filepath"
@ -25,6 +26,11 @@ func NewOrchestrator(flags *Flags) (*Orchestrator, error) {
    oramaHome := "/home/debros"
    oramaDir := oramaHome + "/.orama"

    // Prompt for base domain if not provided via flag
    if flags.BaseDomain == "" {
        flags.BaseDomain = promptForBaseDomain()
    }

    // Normalize peers
    peers, err := utils.NormalizePeers(flags.PeersStr)
    if err != nil {
@ -227,3 +233,52 @@ func (o *Orchestrator) printFirstNodeSecrets() {
    fmt.Printf("   Node Peer ID:\n")
    fmt.Printf("   %s\n\n", o.setup.NodePeerID)
}

// promptForBaseDomain interactively prompts the user to select a network environment
// Returns the selected base domain for deployment routing
func promptForBaseDomain() string {
    reader := bufio.NewReader(os.Stdin)

    fmt.Println("\n🌐 Network Environment Selection")
    fmt.Println("=================================")
    fmt.Println("Select the network environment for this node:")
    fmt.Println()
    fmt.Println("  1. devnet-orama.network   (Development - for testing)")
    fmt.Println("  2. testnet-orama.network  (Testnet - pre-production)")
    fmt.Println("  3. mainnet-orama.network  (Mainnet - production)")
    fmt.Println("  4. Custom domain...")
    fmt.Println()
    fmt.Print("Select option [1-4] (default: 1): ")

    choice, _ := reader.ReadString('\n')
    choice = strings.TrimSpace(choice)

    switch choice {
    case "", "1":
        fmt.Println("✓ Selected: devnet-orama.network")
        return "devnet-orama.network"
    case "2":
        fmt.Println("✓ Selected: testnet-orama.network")
        return "testnet-orama.network"
    case "3":
        fmt.Println("✓ Selected: mainnet-orama.network")
        return "mainnet-orama.network"
    case "4":
        fmt.Print("Enter custom base domain (e.g., example.com): ")
        customDomain, _ := reader.ReadString('\n')
        customDomain = strings.TrimSpace(customDomain)
        if customDomain == "" {
            fmt.Println("⚠️  No domain entered, using devnet-orama.network")
            return "devnet-orama.network"
        }
        // Remove any protocol prefix if user included it
        customDomain = strings.TrimPrefix(customDomain, "https://")
        customDomain = strings.TrimPrefix(customDomain, "http://")
        customDomain = strings.TrimSuffix(customDomain, "/")
        fmt.Printf("✓ Selected: %s\n", customDomain)
        return customDomain
    default:
        fmt.Println("⚠️  Invalid option, using devnet-orama.network")
        return "devnet-orama.network"
    }
}
@ -9,10 +9,12 @@ import (

// IssueAPIKeyHandler issues an API key after signature verification.
// Similar to VerifyHandler but only returns the API key without JWT tokens.
// For non-default namespaces, may trigger cluster provisioning and return 202 Accepted.
//
// POST /v1/auth/api-key
// Request body: APIKeyRequest
// Response: { "api_key", "namespace", "plan", "wallet" }
// Or 202 Accepted: { "status": "provisioning", "cluster_id", "poll_url" }
func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
    if h.authService == nil {
        writeError(w, http.StatusServiceUnavailable, "auth service not initialized")
@ -44,6 +46,56 @@ func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
    nsID, _ := h.resolveNamespace(ctx, req.Namespace)
    h.markNonceUsed(ctx, nsID, strings.ToLower(req.Wallet), req.Nonce)

    // Check if namespace cluster provisioning is needed (for non-default namespaces)
    namespace := strings.TrimSpace(req.Namespace)
    if namespace == "" {
        namespace = "default"
    }

    if h.clusterProvisioner != nil && namespace != "default" {
        clusterID, status, needsProvisioning, err := h.clusterProvisioner.CheckNamespaceCluster(ctx, namespace)
        if err != nil {
            // Log but don't fail - cluster provisioning is optional (error may just mean no cluster yet)
            _ = err
        } else if needsProvisioning {
            // Trigger provisioning for new namespace
            nsIDInt := 0
            if id, ok := nsID.(int); ok {
                nsIDInt = id
            } else if id, ok := nsID.(int64); ok {
                nsIDInt = int(id)
            } else if id, ok := nsID.(float64); ok {
                nsIDInt = int(id)
            }

            newClusterID, pollURL, provErr := h.clusterProvisioner.ProvisionNamespaceCluster(ctx, nsIDInt, namespace, req.Wallet)
            if provErr != nil {
                writeError(w, http.StatusInternalServerError, "failed to start cluster provisioning")
                return
            }

            writeJSON(w, http.StatusAccepted, map[string]any{
                "status":                 "provisioning",
                "cluster_id":             newClusterID,
                "poll_url":               pollURL,
                "estimated_time_seconds": 60,
                "message":                "Namespace cluster is being provisioned. Poll the status URL for updates.",
            })
            return
        } else if status == "provisioning" {
            // Already provisioning, return poll URL
            writeJSON(w, http.StatusAccepted, map[string]any{
                "status":                 "provisioning",
                "cluster_id":             clusterID,
                "poll_url":               "/v1/namespace/status?id=" + clusterID,
                "estimated_time_seconds": 60,
                "message":                "Namespace cluster is being provisioned. Poll the status URL for updates.",
            })
            return
        }
        // If status is "ready" or "default", proceed with API key generation
    }

    apiKey, err := h.authService.GetOrCreateAPIKey(ctx, req.Wallet, req.Namespace)
    if err != nil {
        writeError(w, http.StatusInternalServerError, err.Error())
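The 202 payload written above is a plain JSON map; a client that prefers typed decoding over `map[string]interface{}` could mirror it with a small struct. A hedged sketch (this struct exists nowhere in the commit; its field set simply matches the keys the handler writes):

// provisioningAccepted mirrors the 202 Accepted body emitted by IssueAPIKeyHandler.
type provisioningAccepted struct {
    Status               string `json:"status"`     // always "provisioning"
    ClusterID            string `json:"cluster_id"`
    PollURL              string `json:"poll_url"`   // e.g. /v1/namespace/status?id={cluster_id}
    EstimatedTimeSeconds int    `json:"estimated_time_seconds"`
    Message              string `json:"message"`
}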
@ -35,13 +35,24 @@ type QueryResult struct {
    Rows []interface{} `json:"rows"`
}

// ClusterProvisioner defines the interface for namespace cluster provisioning
type ClusterProvisioner interface {
    // CheckNamespaceCluster checks if a namespace has a cluster and returns its status
    // Returns: (clusterID, status, needsProvisioning, error)
    CheckNamespaceCluster(ctx context.Context, namespaceName string) (string, string, bool, error)
    // ProvisionNamespaceCluster triggers provisioning for a new namespace
    // Returns: (clusterID, pollURL, error)
    ProvisionNamespaceCluster(ctx context.Context, namespaceID int, namespaceName, wallet string) (string, string, error)
}

// Handlers holds dependencies for authentication HTTP handlers
type Handlers struct {
-   logger         *logging.ColoredLogger
-   authService    *authsvc.Service
-   netClient      NetworkClient
-   defaultNS      string
-   internalAuthFn func(context.Context) context.Context
+   logger             *logging.ColoredLogger
+   authService        *authsvc.Service
+   netClient          NetworkClient
+   defaultNS          string
+   internalAuthFn     func(context.Context) context.Context
+   clusterProvisioner ClusterProvisioner // Optional: for namespace cluster provisioning
}

// NewHandlers creates a new authentication handlers instance
@ -61,6 +72,11 @@ func NewHandlers(
    }
}

// SetClusterProvisioner sets the cluster provisioner for namespace cluster management
func (h *Handlers) SetClusterProvisioner(cp ClusterProvisioner) {
    h.clusterProvisioner = cp
}

// markNonceUsed marks a nonce as used in the database
func (h *Handlers) markNonceUsed(ctx context.Context, namespaceID interface{}, wallet, nonce string) {
    if h.netClient == nil {
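Because `clusterProvisioner` is optional and injected through `SetClusterProvisioner`, the handler can be exercised without a real provisioner. A hedged sketch of a no-op implementation that satisfies the interface above (the type name and wiring are illustrative; no such stub exists in this commit):

// noopProvisioner reports every namespace as already ready, so the auth flow never returns 202.
type noopProvisioner struct{}

func (noopProvisioner) CheckNamespaceCluster(ctx context.Context, namespaceName string) (string, string, bool, error) {
    // clusterID, status, needsProvisioning, error
    return "", "ready", false, nil
}

func (noopProvisioner) ProvisionNamespaceCluster(ctx context.Context, namespaceID int, namespaceName, wallet string) (string, string, error) {
    // clusterID, pollURL, error
    return "", "", nil
}

// Wiring, e.g. in a test: handlers.SetClusterProvisioner(noopProvisioner{})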
@ -2,8 +2,10 @@ package deployments

import (
    "context"
    "crypto/rand"
    "encoding/json"
    "fmt"
    "strings"
    "time"

    "github.com/DeBrosOfficial/network/pkg/deployments"
@ -12,6 +14,13 @@ import (
    "go.uber.org/zap"
)

const (
    // subdomainSuffixLength is the length of the random suffix for deployment subdomains
    subdomainSuffixLength = 6
    // subdomainSuffixChars are the allowed characters for the random suffix (lowercase alphanumeric)
    subdomainSuffixChars = "abcdefghijklmnopqrstuvwxyz0123456789"
)

// DeploymentService manages deployment operations
type DeploymentService struct {
    db rqlite.Client
@ -74,6 +83,87 @@ func GetShortNodeID(peerID string) string {
    return "node-" + peerID[:6]
}

// generateRandomSuffix generates a random alphanumeric suffix for subdomains
func generateRandomSuffix(length int) string {
    b := make([]byte, length)
    if _, err := rand.Read(b); err != nil {
        // Fallback to timestamp-based if crypto/rand fails
        return fmt.Sprintf("%06x", time.Now().UnixNano()%0xffffff)
    }
    for i := range b {
        b[i] = subdomainSuffixChars[int(b[i])%len(subdomainSuffixChars)]
    }
    return string(b)
}

// generateSubdomain generates a unique subdomain for a deployment
// Format: {name}-{random} (e.g., "myapp-f3o4if")
func (s *DeploymentService) generateSubdomain(ctx context.Context, name, namespace, deploymentID string) (string, error) {
    // Sanitize name for subdomain (lowercase, alphanumeric and hyphens only)
    sanitizedName := strings.ToLower(name)
    sanitizedName = strings.Map(func(r rune) rune {
        if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
            return r
        }
        return '-'
    }, sanitizedName)
    // Remove consecutive hyphens and trim
    for strings.Contains(sanitizedName, "--") {
        sanitizedName = strings.ReplaceAll(sanitizedName, "--", "-")
    }
    sanitizedName = strings.Trim(sanitizedName, "-")

    // Try to generate a unique subdomain (max 10 attempts)
    for i := 0; i < 10; i++ {
        suffix := generateRandomSuffix(subdomainSuffixLength)
        subdomain := fmt.Sprintf("%s-%s", sanitizedName, suffix)

        // Check if subdomain is already taken globally
        exists, err := s.subdomainExists(ctx, subdomain)
        if err != nil {
            return "", fmt.Errorf("failed to check subdomain: %w", err)
        }
        if !exists {
            // Register the subdomain globally
            if err := s.registerSubdomain(ctx, subdomain, namespace, deploymentID); err != nil {
                // If registration fails (race condition), try again
                s.logger.Warn("Failed to register subdomain, retrying",
                    zap.String("subdomain", subdomain),
                    zap.Error(err),
                )
                continue
            }
            return subdomain, nil
        }
    }

    return "", fmt.Errorf("failed to generate unique subdomain after 10 attempts")
}

// subdomainExists checks if a subdomain is already registered globally
func (s *DeploymentService) subdomainExists(ctx context.Context, subdomain string) (bool, error) {
    type existsRow struct {
        Exists int `db:"exists"`
    }
    var rows []existsRow
    query := `SELECT 1 as exists FROM global_deployment_subdomains WHERE subdomain = ? LIMIT 1`
    err := s.db.Query(ctx, &rows, query, subdomain)
    if err != nil {
        return false, err
    }
    return len(rows) > 0, nil
}

// registerSubdomain registers a subdomain in the global registry
func (s *DeploymentService) registerSubdomain(ctx context.Context, subdomain, namespace, deploymentID string) error {
    query := `
        INSERT INTO global_deployment_subdomains (subdomain, namespace, deployment_id, created_at)
        VALUES (?, ?, ?, ?)
    `
    _, err := s.db.Exec(ctx, query, subdomain, namespace, deploymentID, time.Now())
    return err
}

// CreateDeployment creates a new deployment
func (s *DeploymentService) CreateDeployment(ctx context.Context, deployment *deployments.Deployment) error {
    // Always use current node's peer ID for home node
@ -90,6 +180,16 @@ func (s *DeploymentService) CreateDeployment(ctx context.Context, deployment *de
        deployment.HomeNodeID = homeNodeID
    }

    // Generate unique subdomain with random suffix if not already set
    // Format: {name}-{random} (e.g., "myapp-f3o4if")
    if deployment.Subdomain == "" {
        subdomain, err := s.generateSubdomain(ctx, deployment.Name, deployment.Namespace, deployment.ID)
        if err != nil {
            return fmt.Errorf("failed to generate subdomain: %w", err)
        }
        deployment.Subdomain = subdomain
    }

    // Allocate port for dynamic deployments
    if deployment.Type != deployments.DeploymentTypeStatic && deployment.Type != deployments.DeploymentTypeNextJSStatic {
        port, err := s.portAllocator.AllocatePort(ctx, deployment.HomeNodeID, deployment.ID)
@ -307,13 +407,24 @@ func (s *DeploymentService) CreateDNSRecords(ctx context.Context, deployment *de
        return err
    }

-   // Create deployment record: {name}.{baseDomain}
+   // Use subdomain if set, otherwise fall back to name
+   // New format: {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
+   dnsName := deployment.Subdomain
+   if dnsName == "" {
+       dnsName = deployment.Name
+   }
+
+   // Create deployment record: {subdomain}.{baseDomain}
    // Any node can receive the request and proxy to the home node if needed
-   fqdn := fmt.Sprintf("%s.%s.", deployment.Name, s.BaseDomain())
+   fqdn := fmt.Sprintf("%s.%s.", dnsName, s.BaseDomain())
    if err := s.createDNSRecord(ctx, fqdn, "A", nodeIP, deployment.Namespace, deployment.ID); err != nil {
        s.logger.Error("Failed to create DNS record", zap.Error(err))
    } else {
-       s.logger.Info("Created DNS record", zap.String("fqdn", fqdn), zap.String("ip", nodeIP))
+       s.logger.Info("Created DNS record",
+           zap.String("fqdn", fqdn),
+           zap.String("ip", nodeIP),
+           zap.String("subdomain", dnsName),
+       )
    }

    return nil
@ -373,9 +484,14 @@ func (s *DeploymentService) getNodeIP(ctx context.Context, nodeID string) (strin

// BuildDeploymentURLs builds all URLs for a deployment
func (s *DeploymentService) BuildDeploymentURLs(deployment *deployments.Deployment) []string {
-   // Simple URL format: {name}.{baseDomain}
+   // Use subdomain if set, otherwise fall back to name
+   // New format: {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
+   dnsName := deployment.Subdomain
+   if dnsName == "" {
+       dnsName = deployment.Name
+   }
    return []string{
-       fmt.Sprintf("https://%s.%s", deployment.Name, s.BaseDomain()),
+       fmt.Sprintf("https://%s.%s", dnsName, s.BaseDomain()),
    }
}
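The subdomain scheme above boils down to: lowercase the name, replace anything outside [a-z0-9-] with a hyphen, collapse runs of hyphens, trim, then append a 6-character random suffix and register it globally. A standalone sketch of just the sanitization step (illustrative helper, not part of the service):

package main

import (
    "fmt"
    "strings"
)

// sanitizeForSubdomain mirrors the name-cleaning logic in generateSubdomain.
func sanitizeForSubdomain(name string) string {
    s := strings.ToLower(name)
    s = strings.Map(func(r rune) rune {
        if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
            return r
        }
        return '-'
    }, s)
    for strings.Contains(s, "--") {
        s = strings.ReplaceAll(s, "--", "-")
    }
    return strings.Trim(s, "-")
}

func main() {
    // "My App!" becomes "my-app"; the service would then append "-{6 random chars}".
    fmt.Println(sanitizeForSubdomain("My App!"))
}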
pkg/gateway/handlers/namespace/status_handler.go (new file): 206 lines
@ -0,0 +1,206 @@
// Package namespace provides HTTP handlers for namespace cluster operations
package namespace

import (
    "encoding/json"
    "net/http"

    "github.com/DeBrosOfficial/network/pkg/logging"
    ns "github.com/DeBrosOfficial/network/pkg/namespace"
    "go.uber.org/zap"
)

// StatusHandler handles namespace cluster status requests
type StatusHandler struct {
    clusterManager *ns.ClusterManager
    logger         *zap.Logger
}

// NewStatusHandler creates a new namespace status handler
func NewStatusHandler(clusterManager *ns.ClusterManager, logger *logging.ColoredLogger) *StatusHandler {
    return &StatusHandler{
        clusterManager: clusterManager,
        logger:         logger.Logger.With(zap.String("handler", "namespace-status")),
    }
}

// StatusResponse represents the response for /v1/namespace/status
type StatusResponse struct {
    ClusterID    string   `json:"cluster_id"`
    Namespace    string   `json:"namespace"`
    Status       string   `json:"status"`
    Nodes        []string `json:"nodes"`
    RQLiteReady  bool     `json:"rqlite_ready"`
    OlricReady   bool     `json:"olric_ready"`
    GatewayReady bool     `json:"gateway_ready"`
    DNSReady     bool     `json:"dns_ready"`
    Error        string   `json:"error,omitempty"`
    GatewayURL   string   `json:"gateway_url,omitempty"`
}

// Handle handles GET /v1/namespace/status?id={cluster_id}
func (h *StatusHandler) Handle(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        writeError(w, http.StatusMethodNotAllowed, "method not allowed")
        return
    }

    clusterID := r.URL.Query().Get("id")
    if clusterID == "" {
        writeError(w, http.StatusBadRequest, "cluster_id parameter required")
        return
    }

    ctx := r.Context()
    status, err := h.clusterManager.GetClusterStatus(ctx, clusterID)
    if err != nil {
        h.logger.Error("Failed to get cluster status",
            zap.String("cluster_id", clusterID),
            zap.Error(err),
        )
        writeError(w, http.StatusNotFound, "cluster not found")
        return
    }

    resp := StatusResponse{
        ClusterID:    status.ClusterID,
        Namespace:    status.Namespace,
        Status:       string(status.Status),
        Nodes:        status.Nodes,
        RQLiteReady:  status.RQLiteReady,
        OlricReady:   status.OlricReady,
        GatewayReady: status.GatewayReady,
        DNSReady:     status.DNSReady,
        Error:        status.Error,
    }

    // Include gateway URL when ready
    if status.Status == ns.ClusterStatusReady {
        // Gateway URL would be constructed from cluster configuration
        // For now, we'll leave it empty and let the client construct it
    }

    writeJSON(w, http.StatusOK, resp)
}

// HandleByName handles GET /v1/namespace/status/name/{namespace}
func (h *StatusHandler) HandleByName(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        writeError(w, http.StatusMethodNotAllowed, "method not allowed")
        return
    }

    // Extract namespace from path
    path := r.URL.Path
    namespace := ""
    const prefix = "/v1/namespace/status/name/"
    if len(path) > len(prefix) {
        namespace = path[len(prefix):]
    }

    if namespace == "" {
        writeError(w, http.StatusBadRequest, "namespace parameter required")
        return
    }

    ctx := r.Context()
    cluster, err := h.clusterManager.GetClusterByNamespaceName(ctx, namespace)
    if err != nil {
        h.logger.Debug("Cluster not found for namespace",
            zap.String("namespace", namespace),
            zap.Error(err),
        )
        writeError(w, http.StatusNotFound, "cluster not found for namespace")
        return
    }

    status, err := h.clusterManager.GetClusterStatus(ctx, cluster.ID)
    if err != nil {
        writeError(w, http.StatusInternalServerError, "failed to get cluster status")
        return
    }

    resp := StatusResponse{
        ClusterID:    status.ClusterID,
        Namespace:    status.Namespace,
        Status:       string(status.Status),
        Nodes:        status.Nodes,
        RQLiteReady:  status.RQLiteReady,
        OlricReady:   status.OlricReady,
        GatewayReady: status.GatewayReady,
        DNSReady:     status.DNSReady,
        Error:        status.Error,
    }

    writeJSON(w, http.StatusOK, resp)
}

// ProvisionRequest represents a request to provision a new namespace cluster
type ProvisionRequest struct {
    Namespace     string `json:"namespace"`
    ProvisionedBy string `json:"provisioned_by"` // Wallet address
}

// ProvisionResponse represents the response when provisioning starts
type ProvisionResponse struct {
    Status               string `json:"status"`
    ClusterID            string `json:"cluster_id"`
    PollURL              string `json:"poll_url"`
    EstimatedTimeSeconds int    `json:"estimated_time_seconds"`
}

// HandleProvision handles POST /v1/namespace/provision
func (h *StatusHandler) HandleProvision(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodPost {
        writeError(w, http.StatusMethodNotAllowed, "method not allowed")
        return
    }

    var req ProvisionRequest
    if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
        writeError(w, http.StatusBadRequest, "invalid json body")
        return
    }

    if req.Namespace == "" || req.ProvisionedBy == "" {
        writeError(w, http.StatusBadRequest, "namespace and provisioned_by are required")
        return
    }

    // Don't allow provisioning the "default" namespace this way
    if req.Namespace == "default" {
        writeError(w, http.StatusBadRequest, "cannot provision the default namespace")
        return
    }

    ctx := r.Context()

    // Check if namespace exists
    // For now, we assume the namespace ID is passed or we look it up
    // This would typically be done through the auth service
    // For simplicity, we'll use a placeholder namespace ID

    h.logger.Info("Namespace provisioning requested",
        zap.String("namespace", req.Namespace),
        zap.String("provisioned_by", req.ProvisionedBy),
    )

    // Note: In a full implementation, we'd look up the namespace ID from the database
    // For now, we'll create a placeholder that indicates provisioning should happen
    // The actual provisioning is triggered through the auth flow

    writeJSON(w, http.StatusAccepted, map[string]interface{}{
        "status":  "accepted",
        "message": "Provisioning request accepted. Use auth flow to provision namespace cluster.",
    })
}

func writeJSON(w http.ResponseWriter, status int, data interface{}) {
    w.Header().Set("Content-Type", "application/json")
    w.WriteHeader(status)
    json.NewEncoder(w).Encode(data)
}

func writeError(w http.ResponseWriter, status int, message string) {
    writeJSON(w, status, map[string]string{"error": message})
}
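How these handlers are mounted is not shown in this file; a hedged sketch of one plausible wiring with the standard library mux, with the paths taken from the handler comments and everything else illustrative:

// Illustrative registration; assumes a *StatusHandler named h and an *http.ServeMux named mux.
func registerNamespaceRoutes(mux *http.ServeMux, h *StatusHandler) {
    mux.HandleFunc("/v1/namespace/status", h.Handle)              // GET ?id={cluster_id}
    mux.HandleFunc("/v1/namespace/status/name/", h.HandleByName)  // GET /v1/namespace/status/name/{namespace}
    mux.HandleFunc("/v1/namespace/provision", h.HandleProvision)  // POST
}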
469
pkg/gateway/instance_spawner.go
Normal file
469
pkg/gateway/instance_spawner.go
Normal file
@ -0,0 +1,469 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/tlsutil"
|
||||
"go.uber.org/zap"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
|
||||
type InstanceNodeStatus string
|
||||
|
||||
const (
|
||||
InstanceStatusPending InstanceNodeStatus = "pending"
|
||||
InstanceStatusStarting InstanceNodeStatus = "starting"
|
||||
InstanceStatusRunning InstanceNodeStatus = "running"
|
||||
InstanceStatusStopped InstanceNodeStatus = "stopped"
|
||||
InstanceStatusFailed InstanceNodeStatus = "failed"
|
||||
)
|
||||
|
||||
// InstanceError represents an error during instance operations (local type to avoid import cycle)
|
||||
type InstanceError struct {
|
||||
Message string
|
||||
Cause error
|
||||
}
|
||||
|
||||
func (e *InstanceError) Error() string {
|
||||
if e.Cause != nil {
|
||||
return e.Message + ": " + e.Cause.Error()
|
||||
}
|
||||
return e.Message
|
||||
}
|
||||
|
||||
func (e *InstanceError) Unwrap() error {
|
||||
return e.Cause
|
||||
}
|
||||
|
||||
// InstanceSpawner manages multiple Gateway instances for namespace clusters.
// Each namespace gets its own gateway instances that connect to its dedicated RQLite and Olric clusters.
type InstanceSpawner struct {
	logger    *zap.Logger
	baseDir   string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
	instances map[string]*GatewayInstance
	mu        sync.RWMutex
}

// GatewayInstance represents a running Gateway instance for a namespace
type GatewayInstance struct {
	Namespace       string
	NodeID          string
	HTTPPort        int
	BaseDomain      string
	RQLiteDSN       string   // Connection to namespace RQLite
	OlricServers    []string // Connection to namespace Olric
	ConfigPath      string
	PID             int
	Status          InstanceNodeStatus
	StartedAt       time.Time
	LastHealthCheck time.Time

	cmd    *exec.Cmd
	logger *zap.Logger
}

// InstanceConfig holds configuration for spawning a Gateway instance
type InstanceConfig struct {
	Namespace    string   // Namespace name (e.g., "alice")
	NodeID       string   // Physical node ID
	HTTPPort     int      // HTTP API port
	BaseDomain   string   // Base domain (e.g., "devnet-orama.network")
	RQLiteDSN    string   // RQLite connection DSN (e.g., "http://localhost:10000")
	OlricServers []string // Olric server addresses
	NodePeerID   string   // Physical node's peer ID for home node management
	DataDir      string   // Data directory for deployments, SQLite, etc.
}

// GatewayYAMLConfig represents the gateway YAML configuration structure
type GatewayYAMLConfig struct {
	ListenAddr      string   `yaml:"listen_addr"`
	ClientNamespace string   `yaml:"client_namespace"`
	RQLiteDSN       string   `yaml:"rqlite_dsn"`
	OlricServers    []string `yaml:"olric_servers"`
	BaseDomain      string   `yaml:"base_domain"`
	NodePeerID      string   `yaml:"node_peer_id"`
	DataDir         string   `yaml:"data_dir"`
}

// NewInstanceSpawner creates a new Gateway instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
	return &InstanceSpawner{
		logger:    logger.With(zap.String("component", "gateway-instance-spawner")),
		baseDir:   baseDir,
		instances: make(map[string]*GatewayInstance),
	}
}

// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(ns, nodeID string) string {
	return fmt.Sprintf("%s:%s", ns, nodeID)
}
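
To make the intended call pattern concrete, here is a minimal sketch (from within package gateway) of spawning one namespace gateway; the ports, IPs, and paths are placeholders, not values from this change:

// Sketch: spawn one namespace gateway and log where it is reachable.
func exampleSpawn(logger *zap.Logger) error {
	spawner := NewInstanceSpawner("/home/orama/.orama/data/namespaces", logger)

	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	defer cancel()

	inst, err := spawner.SpawnInstance(ctx, InstanceConfig{
		Namespace:    "alice",
		NodeID:       "node-1",
		HTTPPort:     10004,
		BaseDomain:   "devnet-orama.network",
		RQLiteDSN:    "http://10.0.0.1:10000",
		OlricServers: []string{"10.0.0.1:10002", "10.0.0.2:10002"},
		NodePeerID:   "node-1",
		DataDir:      "/home/orama/.orama/data/namespaces/alice/data",
	})
	if err != nil {
		return err
	}
	logger.Info("spawned", zap.String("url", inst.ExternalURL()), zap.Int("pid", inst.PID))
	return nil
}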
// SpawnInstance starts a new Gateway instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*GatewayInstance, error) {
	key := instanceKey(cfg.Namespace, cfg.NodeID)

	is.mu.Lock()
	if existing, ok := is.instances[key]; ok {
		// Instance already exists; return it if it is running.
		if existing.Status == InstanceStatusRunning {
			is.mu.Unlock()
			return existing, nil
		}
		// Otherwise, remove it and start fresh.
		delete(is.instances, key)
	}
	is.mu.Unlock()

	// Create config, logs and data directories
	configDir := filepath.Join(is.baseDir, cfg.Namespace, "configs")
	logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
	dataDir := filepath.Join(is.baseDir, cfg.Namespace, "data")

	for _, dir := range []string{configDir, logsDir, dataDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return nil, &InstanceError{
				Message: fmt.Sprintf("failed to create directory %s", dir),
				Cause:   err,
			}
		}
	}

	// Generate config file
	configPath := filepath.Join(configDir, fmt.Sprintf("gateway-%s.yaml", cfg.NodeID))
	if err := is.generateConfig(configPath, cfg, dataDir); err != nil {
		return nil, err
	}

	instance := &GatewayInstance{
		Namespace:    cfg.Namespace,
		NodeID:       cfg.NodeID,
		HTTPPort:     cfg.HTTPPort,
		BaseDomain:   cfg.BaseDomain,
		RQLiteDSN:    cfg.RQLiteDSN,
		OlricServers: cfg.OlricServers,
		ConfigPath:   configPath,
		Status:       InstanceStatusStarting,
		logger:       is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
	}

	instance.logger.Info("Starting Gateway instance",
		zap.Int("http_port", cfg.HTTPPort),
		zap.String("rqlite_dsn", cfg.RQLiteDSN),
		zap.Strings("olric_servers", cfg.OlricServers),
	)

	// Find the gateway binary (resolved via PATH)
	gatewayBinary := "gateway"

	// Create command
	cmd := exec.CommandContext(ctx, gatewayBinary, "--config", configPath)
	instance.cmd = cmd

	// Set up logging
	logPath := filepath.Join(logsDir, fmt.Sprintf("gateway-%s.log", cfg.NodeID))
	logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return nil, &InstanceError{
			Message: "failed to open log file",
			Cause:   err,
		}
	}

	cmd.Stdout = logFile
	cmd.Stderr = logFile

	// Start the process
	if err := cmd.Start(); err != nil {
		logFile.Close()
		return nil, &InstanceError{
			Message: "failed to start Gateway process",
			Cause:   err,
		}
	}

	// The child process holds its own copy of the log file descriptor.
	logFile.Close()

	instance.PID = cmd.Process.Pid
	instance.StartedAt = time.Now()

	// Store instance
	is.mu.Lock()
	is.instances[key] = instance
	is.mu.Unlock()

	// Wait for the instance to become ready
	if err := is.waitForInstanceReady(ctx, instance); err != nil {
		// Kill the process on failure
		if cmd.Process != nil {
			_ = cmd.Process.Kill()
		}
		is.mu.Lock()
		delete(is.instances, key)
		is.mu.Unlock()
		return nil, &InstanceError{
			Message: "Gateway instance did not become ready",
			Cause:   err,
		}
	}

	instance.Status = InstanceStatusRunning
	instance.LastHealthCheck = time.Now()

	instance.logger.Info("Gateway instance started successfully",
		zap.Int("pid", instance.PID),
	)

	// Start background process monitor
	go is.monitorInstance(instance)

	return instance, nil
}

// generateConfig generates the Gateway YAML configuration file
func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig, dataDir string) error {
	gatewayCfg := GatewayYAMLConfig{
		ListenAddr:      fmt.Sprintf(":%d", cfg.HTTPPort),
		ClientNamespace: cfg.Namespace,
		RQLiteDSN:       cfg.RQLiteDSN,
		OlricServers:    cfg.OlricServers,
		BaseDomain:      cfg.BaseDomain,
		NodePeerID:      cfg.NodePeerID,
		DataDir:         dataDir,
	}

	data, err := yaml.Marshal(gatewayCfg)
	if err != nil {
		return &InstanceError{
			Message: "failed to marshal Gateway config",
			Cause:   err,
		}
	}

	if err := os.WriteFile(configPath, data, 0644); err != nil {
		return &InstanceError{
			Message: "failed to write Gateway config",
			Cause:   err,
		}
	}

	return nil
}
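
For reference, the config written by generateConfig marshals to YAML roughly as follows, per the yaml tags on GatewayYAMLConfig above; the values and the exact quoting and indentation are illustrative:

// Sketch: what a generated config roughly looks like (values are placeholders).
func exampleConfigYAML() {
	cfg := GatewayYAMLConfig{
		ListenAddr:      ":10004",
		ClientNamespace: "alice",
		RQLiteDSN:       "http://10.0.0.1:10000",
		OlricServers:    []string{"10.0.0.1:10002"},
		BaseDomain:      "devnet-orama.network",
		NodePeerID:      "node-1",
		DataDir:         "/home/orama/.orama/data/namespaces/alice/data",
	}
	out, _ := yaml.Marshal(cfg)
	fmt.Print(string(out))
	// Approximate output:
	//   listen_addr: ':10004'
	//   client_namespace: alice
	//   rqlite_dsn: http://10.0.0.1:10000
	//   olric_servers:
	//       - 10.0.0.1:10002
	//   base_domain: devnet-orama.network
	//   node_peer_id: node-1
	//   data_dir: /home/orama/.orama/data/namespaces/alice/data
}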
// StopInstance stops a Gateway instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, ns, nodeID string) error {
	key := instanceKey(ns, nodeID)

	is.mu.Lock()
	instance, ok := is.instances[key]
	if !ok {
		is.mu.Unlock()
		return nil // Already stopped
	}
	delete(is.instances, key)
	is.mu.Unlock()

	if instance.cmd != nil && instance.cmd.Process != nil {
		instance.logger.Info("Stopping Gateway instance", zap.Int("pid", instance.PID))

		// Send SIGINT for graceful shutdown
		if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
			// If signalling fails, kill it
			_ = instance.cmd.Process.Kill()
		}

		// Wait for the process to exit, with a timeout
		done := make(chan error, 1)
		go func() {
			done <- instance.cmd.Wait()
		}()

		select {
		case <-done:
			instance.logger.Info("Gateway instance stopped gracefully")
		case <-time.After(10 * time.Second):
			instance.logger.Warn("Gateway instance did not stop gracefully, killing")
			_ = instance.cmd.Process.Kill()
		case <-ctx.Done():
			_ = instance.cmd.Process.Kill()
			return ctx.Err()
		}
	}

	instance.Status = InstanceStatusStopped
	return nil
}

// StopAllInstances stops all Gateway instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
	is.mu.RLock()
	var keys []string
	for key, inst := range is.instances {
		if inst.Namespace == ns {
			keys = append(keys, key)
		}
	}
	is.mu.RUnlock()

	var lastErr error
	for _, key := range keys {
		parts := strings.SplitN(key, ":", 2)
		if len(parts) == 2 {
			if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
				lastErr = err
			}
		}
	}
	return lastErr
}

// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(ns, nodeID string) (*GatewayInstance, bool) {
	is.mu.RLock()
	defer is.mu.RUnlock()

	instance, ok := is.instances[instanceKey(ns, nodeID)]
	return instance, ok
}

// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*GatewayInstance {
	is.mu.RLock()
	defer is.mu.RUnlock()

	var instances []*GatewayInstance
	for _, inst := range is.instances {
		if inst.Namespace == ns {
			instances = append(instances, inst)
		}
	}
	return instances
}

// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, ns, nodeID string) (bool, error) {
	instance, ok := is.GetInstance(ns, nodeID)
	if !ok {
		return false, &InstanceError{Message: "instance not found"}
	}

	healthy, err := instance.IsHealthy(ctx)
	if healthy {
		is.mu.Lock()
		instance.LastHealthCheck = time.Now()
		is.mu.Unlock()
	}
	return healthy, err
}
// waitForInstanceReady waits for the Gateway instance to become ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *GatewayInstance) error {
	client := tlsutil.NewHTTPClient(2 * time.Second)

	// Gateway health check endpoint
	url := fmt.Sprintf("http://localhost:%d/v1/health", instance.HTTPPort)

	maxAttempts := 120 // roughly 2 minutes at one attempt per second
	for i := 0; i < maxAttempts; i++ {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(1 * time.Second):
		}

		resp, err := client.Get(url)
		if err != nil {
			continue
		}
		resp.Body.Close()

		if resp.StatusCode == http.StatusOK {
			instance.logger.Debug("Gateway instance ready",
				zap.Int("attempts", i+1),
			)
			return nil
		}
	}

	return fmt.Errorf("gateway did not become ready within timeout")
}
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *GatewayInstance) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		is.mu.RLock()
		key := instanceKey(instance.Namespace, instance.NodeID)
		_, exists := is.instances[key]
		is.mu.RUnlock()

		if !exists {
			// Instance was removed
			return
		}

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		healthy, _ := instance.IsHealthy(ctx)
		cancel()

		is.mu.Lock()
		if healthy {
			instance.Status = InstanceStatusRunning
			instance.LastHealthCheck = time.Now()
		} else {
			instance.Status = InstanceStatusFailed
			instance.logger.Warn("Gateway instance health check failed")
		}
		is.mu.Unlock()

		// Check whether the process is still running
		if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
			is.mu.Lock()
			instance.Status = InstanceStatusStopped
			is.mu.Unlock()
			instance.logger.Warn("Gateway instance process exited unexpectedly")
			return
		}
	}
}
// IsHealthy checks if the Gateway instance is healthy
func (gi *GatewayInstance) IsHealthy(ctx context.Context) (bool, error) {
	url := fmt.Sprintf("http://localhost:%d/v1/health", gi.HTTPPort)
	client := tlsutil.NewHTTPClient(5 * time.Second)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return false, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK, nil
}

// DSN returns the local connection address for this Gateway instance
func (gi *GatewayInstance) DSN() string {
	return fmt.Sprintf("http://localhost:%d", gi.HTTPPort)
}

// ExternalURL returns the external URL for accessing this namespace's gateway
func (gi *GatewayInstance) ExternalURL() string {
	return fmt.Sprintf("https://ns-%s.%s", gi.Namespace, gi.BaseDomain)
}
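
As a concrete illustration of the two accessors (values hypothetical):

gi := &GatewayInstance{Namespace: "alice", HTTPPort: 10004, BaseDomain: "devnet-orama.network"}
fmt.Println(gi.DSN())         // http://localhost:10004
fmt.Println(gi.ExternalURL()) // https://ns-alice.devnet-orama.network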
|
||||
@ -207,6 +207,9 @@ func isPublicPath(p string) bool {
|
||||
|
||||
// authorizationMiddleware enforces that the authenticated actor owns the namespace
|
||||
// for certain protected paths (e.g., apps CRUD and storage APIs).
|
||||
// Also enforces cross-namespace access control:
|
||||
// - "default" namespace: accessible by any valid API key
|
||||
// - Other namespaces: API key must belong to that specific namespace
|
||||
func (g *Gateway) authorizationMiddleware(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Skip for public/OPTIONS paths only
|
||||
@ -221,7 +224,40 @@ func (g *Gateway) authorizationMiddleware(next http.Handler) http.Handler {
|
||||
return
|
||||
}
|
||||
|
||||
// Only enforce for specific resource paths
|
||||
// Exempt namespace status endpoint
|
||||
if strings.HasPrefix(r.URL.Path, "/v1/namespace/status") {
|
||||
next.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
// Cross-namespace access control for namespace gateways
|
||||
// The gateway's ClientNamespace determines which namespace this gateway serves
|
||||
gatewayNamespace := "default"
|
||||
if g.cfg != nil && g.cfg.ClientNamespace != "" {
|
||||
gatewayNamespace = strings.TrimSpace(g.cfg.ClientNamespace)
|
||||
}
|
||||
|
||||
// Get user's namespace from context (derived from API key/JWT)
|
||||
userNamespace := ""
|
||||
if v := r.Context().Value(CtxKeyNamespaceOverride); v != nil {
|
||||
if s, ok := v.(string); ok {
|
||||
userNamespace = strings.TrimSpace(s)
|
||||
}
|
||||
}
|
||||
|
||||
// For non-default namespace gateways, the API key must belong to this namespace
|
||||
// This enforces physical isolation: alice's gateway only accepts alice's API keys
|
||||
if gatewayNamespace != "default" && userNamespace != "" && userNamespace != gatewayNamespace {
|
||||
g.logger.ComponentWarn(logging.ComponentGeneral, "cross-namespace access denied",
|
||||
zap.String("user_namespace", userNamespace),
|
||||
zap.String("gateway_namespace", gatewayNamespace),
|
||||
zap.String("path", r.URL.Path),
|
||||
)
|
||||
writeError(w, http.StatusForbidden, "API key does not belong to this namespace")
|
||||
return
|
||||
}
|
||||
|
||||
// Only enforce ownership for specific resource paths
|
||||
if !requiresNamespaceOwnership(r.URL.Path) {
|
||||
next.ServeHTTP(w, r)
|
||||
return
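
Distilled to a predicate, the cross-namespace rule added above behaves like this sketch (not the actual middleware code):

// allowRequest mirrors the check above: the default gateway accepts any valid
// key, while a namespace gateway only accepts keys from its own namespace; an
// empty user namespace is left to the ownership checks further down.
func allowRequest(gatewayNS, userNS string) bool {
	if gatewayNS == "default" {
		return true
	}
	return userNS == "" || userNS == gatewayNS
}

// allowRequest("default", "alice") == true
// allowRequest("alice", "alice")   == true
// allowRequest("alice", "bob")     == false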
|
||||
@ -433,8 +469,14 @@ func getClientIP(r *http.Request) string {
|
||||
return host
|
||||
}
|
||||
|
||||
// domainRoutingMiddleware handles requests to deployment domains
|
||||
// domainRoutingMiddleware handles requests to deployment domains and namespace gateways
|
||||
// This must come BEFORE auth middleware so deployment domains work without API keys
|
||||
//
|
||||
// Domain routing patterns:
|
||||
// - ns-{namespace}.{baseDomain} -> Namespace gateway (proxy to namespace cluster)
|
||||
// - {name}-{random}.{baseDomain} -> Deployment domain
|
||||
// - {name}.{baseDomain} -> Deployment domain (legacy)
|
||||
// - {name}.node-xxx.{baseDomain} -> Legacy format (deprecated, returns 404 for new deployments)
|
||||
func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
host := strings.Split(r.Host, ":")[0] // Strip port
|
||||
@ -446,7 +488,7 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
|
||||
}
|
||||
|
||||
// Only process base domain and its subdomains
|
||||
if !strings.HasSuffix(host, "."+baseDomain) {
|
||||
if !strings.HasSuffix(host, "."+baseDomain) && host != baseDomain {
|
||||
next.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
@ -457,6 +499,18 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
|
||||
return
|
||||
}
|
||||
|
||||
// Check for namespace gateway domain: ns-{namespace}.{baseDomain}
|
||||
suffix := "." + baseDomain
|
||||
if strings.HasSuffix(host, suffix) {
|
||||
subdomain := strings.TrimSuffix(host, suffix)
|
||||
if strings.HasPrefix(subdomain, "ns-") {
|
||||
// This is a namespace gateway request
|
||||
namespaceName := strings.TrimPrefix(subdomain, "ns-")
|
||||
g.handleNamespaceGatewayRequest(w, r, namespaceName)
|
||||
return
|
||||
}
|
||||
}
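// For illustration (not part of the original change): a request for host
// "ns-alice.devnet-orama.network" with baseDomain "devnet-orama.network"
// trims to subdomain "ns-alice", so namespaceName becomes "alice" and the
// request is proxied to alice's dedicated gateway cluster.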
|
||||
|
||||
// Check if deployment handlers are available
|
||||
if g.deploymentService == nil || g.staticHandler == nil {
|
||||
next.ServeHTTP(w, r)
|
||||
@ -470,7 +524,7 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
|
||||
return
|
||||
}
|
||||
if deployment == nil {
|
||||
// Domain matches .orama.network but no deployment found
|
||||
// Domain matches .{baseDomain} but no deployment found
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
@ -490,9 +544,112 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
|
||||
})
|
||||
}
|
||||
|
||||
// handleNamespaceGatewayRequest proxies requests to a namespace's dedicated gateway cluster
|
||||
// This enables physical isolation where each namespace has its own RQLite, Olric, and Gateway
|
||||
func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.Request, namespaceName string) {
|
||||
// Look up namespace cluster gateway IPs from DNS records
|
||||
db := g.client.Database()
|
||||
internalCtx := client.WithInternalAuth(r.Context())
|
||||
|
||||
baseDomain := "dbrs.space"
|
||||
if g.cfg != nil && g.cfg.BaseDomain != "" {
|
||||
baseDomain = g.cfg.BaseDomain
|
||||
}
|
||||
|
||||
// Query DNS records for the namespace gateway
|
||||
fqdn := "ns-" + namespaceName + "." + baseDomain + "."
|
||||
query := `SELECT value FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND is_active = TRUE ORDER BY RANDOM() LIMIT 1`
|
||||
result, err := db.Query(internalCtx, query, fqdn)
|
||||
if err != nil || result == nil || len(result.Rows) == 0 {
|
||||
// No gateway found for this namespace
|
||||
g.logger.ComponentWarn(logging.ComponentGeneral, "namespace gateway not found",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("fqdn", fqdn),
|
||||
)
|
||||
http.Error(w, "Namespace gateway not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
gatewayIP := getString(result.Rows[0][0])
|
||||
if gatewayIP == "" {
|
||||
http.Error(w, "Namespace gateway not available", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
// Get the gateway port from namespace_port_allocations
|
||||
// Gateway HTTP port is port_start + 4
|
||||
portQuery := `
|
||||
SELECT npa.gateway_http_port
|
||||
FROM namespace_port_allocations npa
|
||||
JOIN namespace_clusters nc ON npa.namespace_cluster_id = nc.id
|
||||
WHERE nc.namespace_name = ?
|
||||
LIMIT 1
|
||||
`
|
||||
portResult, err := db.Query(internalCtx, portQuery, namespaceName)
|
||||
gatewayPort := 10004 // Default to first namespace's gateway port
|
||||
if err == nil && portResult != nil && len(portResult.Rows) > 0 {
|
||||
if p := getInt(portResult.Rows[0][0]); p > 0 {
|
||||
gatewayPort = p
|
||||
}
|
||||
}
|
||||
|
||||
// Proxy request to the namespace gateway
|
||||
targetURL := "http://" + gatewayIP + ":" + strconv.Itoa(gatewayPort) + r.URL.Path
|
||||
if r.URL.RawQuery != "" {
|
||||
targetURL += "?" + r.URL.RawQuery
|
||||
}
|
||||
|
||||
proxyReq, err := http.NewRequest(r.Method, targetURL, r.Body)
|
||||
if err != nil {
|
||||
g.logger.ComponentError(logging.ComponentGeneral, "failed to create namespace gateway proxy request",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.Error(err),
|
||||
)
|
||||
http.Error(w, "Internal server error", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Copy headers
|
||||
for key, values := range r.Header {
|
||||
for _, value := range values {
|
||||
proxyReq.Header.Add(key, value)
|
||||
}
|
||||
}
|
||||
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
|
||||
proxyReq.Header.Set("X-Forwarded-Proto", "https")
|
||||
proxyReq.Header.Set("X-Forwarded-Host", r.Host)
|
||||
proxyReq.Header.Set("X-Original-Host", r.Host)
|
||||
|
||||
// Execute proxy request
|
||||
httpClient := &http.Client{Timeout: 30 * time.Second}
|
||||
resp, err := httpClient.Do(proxyReq)
|
||||
if err != nil {
|
||||
g.logger.ComponentError(logging.ComponentGeneral, "namespace gateway proxy request failed",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("target", gatewayIP),
|
||||
zap.Error(err),
|
||||
)
|
||||
http.Error(w, "Namespace gateway unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Copy response headers
|
||||
for key, values := range resp.Header {
|
||||
for _, value := range values {
|
||||
w.Header().Add(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
// Write status code and body
|
||||
w.WriteHeader(resp.StatusCode)
|
||||
io.Copy(w, resp.Body)
|
||||
}
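
A design note: the proxy above copies headers and the body by hand. An alternative sketch using the standard library's httputil.ReverseProxy, which also handles streaming and hop-by-hop headers, would look roughly like this; it assumes net/url and net/http/httputil imports and is not what the code above does:

// Sketch only: the target host and port come from the same DNS and port
// lookups performed above.
func proxyToNamespaceGateway(w http.ResponseWriter, r *http.Request, gatewayIP string, gatewayPort int) {
	target := &url.URL{Scheme: "http", Host: fmt.Sprintf("%s:%d", gatewayIP, gatewayPort)}
	proxy := httputil.NewSingleHostReverseProxy(target)
	r.Header.Set("X-Forwarded-Host", r.Host)
	proxy.ServeHTTP(w, r)
}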
|
||||
|
||||
// getDeploymentByDomain looks up a deployment by its domain
|
||||
// Supports formats like:
|
||||
// - {name}.{baseDomain} (e.g., myapp.dbrs.space) - primary format
|
||||
// - {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space) - new format with random suffix
|
||||
// - {name}.{baseDomain} (e.g., myapp.dbrs.space) - legacy format (backwards compatibility)
|
||||
// - {name}.node-{shortID}.{baseDomain} (legacy format for backwards compatibility)
|
||||
// - custom domains via deployment_domains table
|
||||
func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*deployments.Deployment, error) {
|
||||
@ -512,25 +669,28 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
|
||||
db := g.client.Database()
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
// Parse domain to extract deployment name
|
||||
// Parse domain to extract deployment subdomain/name
|
||||
suffix := "." + baseDomain
|
||||
if strings.HasSuffix(domain, suffix) {
|
||||
subdomain := strings.TrimSuffix(domain, suffix)
|
||||
parts := strings.Split(subdomain, ".")
|
||||
|
||||
// Primary format: {name}.{baseDomain} (e.g., myapp.dbrs.space)
|
||||
// Primary format: {subdomain}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
|
||||
// The subdomain can be either:
|
||||
// - {name}-{random} (new format)
|
||||
// - {name} (legacy format)
|
||||
if len(parts) == 1 {
|
||||
deploymentName := parts[0]
|
||||
subdomainOrName := parts[0]
|
||||
|
||||
// Query by name
|
||||
// First, try to find by subdomain (new format: name-random)
|
||||
query := `
|
||||
SELECT id, namespace, name, type, port, content_cid, status, home_node_id
|
||||
SELECT id, namespace, name, type, port, content_cid, status, home_node_id, subdomain
|
||||
FROM deployments
|
||||
WHERE name = ?
|
||||
WHERE subdomain = ?
|
||||
AND status = 'active'
|
||||
LIMIT 1
|
||||
`
|
||||
result, err := db.Query(internalCtx, query, deploymentName)
|
||||
result, err := db.Query(internalCtx, query, subdomainOrName)
|
||||
if err == nil && len(result.Rows) > 0 {
|
||||
row := result.Rows[0]
|
||||
return &deployments.Deployment{
|
||||
@ -542,6 +702,31 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
|
||||
ContentCID: getString(row[5]),
|
||||
Status: deployments.DeploymentStatus(getString(row[6])),
|
||||
HomeNodeID: getString(row[7]),
|
||||
Subdomain: getString(row[8]),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Fallback: try by name for legacy deployments (without random suffix)
|
||||
query = `
|
||||
SELECT id, namespace, name, type, port, content_cid, status, home_node_id, subdomain
|
||||
FROM deployments
|
||||
WHERE name = ?
|
||||
AND status = 'active'
|
||||
LIMIT 1
|
||||
`
|
||||
result, err = db.Query(internalCtx, query, subdomainOrName)
|
||||
if err == nil && len(result.Rows) > 0 {
|
||||
row := result.Rows[0]
|
||||
return &deployments.Deployment{
|
||||
ID: getString(row[0]),
|
||||
Namespace: getString(row[1]),
|
||||
Name: getString(row[2]),
|
||||
Type: deployments.DeploymentType(getString(row[3])),
|
||||
Port: getInt(row[4]),
|
||||
ContentCID: getString(row[5]),
|
||||
Status: deployments.DeploymentStatus(getString(row[6])),
|
||||
HomeNodeID: getString(row[7]),
|
||||
Subdomain: getString(row[8]),
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
659 pkg/namespace/cluster_manager.go (new file)
@ -0,0 +1,659 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/gateway"
|
||||
"github.com/DeBrosOfficial/network/pkg/olric"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
rqliteClient "github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"github.com/google/uuid"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// ClusterManager orchestrates namespace cluster provisioning and lifecycle management.
|
||||
// It coordinates the creation and teardown of RQLite, Olric, and Gateway instances
|
||||
// for each namespace's dedicated cluster.
|
||||
type ClusterManager struct {
|
||||
db rqliteClient.Client
|
||||
portAllocator *NamespacePortAllocator
|
||||
nodeSelector *ClusterNodeSelector
|
||||
rqliteSpawner *rqlite.InstanceSpawner
|
||||
olricSpawner *olric.InstanceSpawner
|
||||
gatewaySpawner *gateway.InstanceSpawner
|
||||
dnsManager *DNSRecordManager
|
||||
baseDomain string
|
||||
baseDataDir string // Base directory for namespace data (e.g., ~/.orama/data/namespaces)
|
||||
logger *zap.Logger
|
||||
}
|
||||
|
||||
// ClusterManagerConfig holds configuration for the ClusterManager
|
||||
type ClusterManagerConfig struct {
|
||||
BaseDomain string // e.g., "devnet-orama.network"
|
||||
BaseDataDir string // e.g., "~/.orama/data/namespaces"
|
||||
}
|
||||
|
||||
// NewClusterManager creates a new cluster manager
|
||||
func NewClusterManager(
|
||||
db rqliteClient.Client,
|
||||
cfg ClusterManagerConfig,
|
||||
logger *zap.Logger,
|
||||
) *ClusterManager {
|
||||
portAllocator := NewNamespacePortAllocator(db, logger)
|
||||
|
||||
return &ClusterManager{
|
||||
db: db,
|
||||
portAllocator: portAllocator,
|
||||
nodeSelector: NewClusterNodeSelector(db, portAllocator, logger),
|
||||
rqliteSpawner: rqlite.NewInstanceSpawner(cfg.BaseDataDir, logger),
|
||||
olricSpawner: olric.NewInstanceSpawner(cfg.BaseDataDir, logger),
|
||||
gatewaySpawner: gateway.NewInstanceSpawner(cfg.BaseDataDir, logger),
|
||||
dnsManager: NewDNSRecordManager(db, cfg.BaseDomain, logger),
|
||||
baseDomain: cfg.BaseDomain,
|
||||
baseDataDir: cfg.BaseDataDir,
|
||||
logger: logger.With(zap.String("component", "cluster-manager")),
|
||||
}
|
||||
}
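
A minimal sketch of driving the manager end to end (from within package namespace), assuming the caller already has an RQLite client and the namespace's database ID; names and IDs are placeholders:

// Sketch: provision a cluster and poll until it is ready or fails.
func exampleProvision(ctx context.Context, db rqliteClient.Client, logger *zap.Logger) error {
	cm := NewClusterManager(db, ClusterManagerConfig{
		BaseDomain:  "devnet-orama.network",
		BaseDataDir: "/home/orama/.orama/data/namespaces",
	}, logger)

	cluster, err := cm.ProvisionCluster(ctx, 42, "alice", "alice@example.com")
	if err != nil {
		return err
	}

	// ProvisionCluster returns immediately; poll for readiness.
	for {
		status, err := cm.GetClusterStatus(ctx, cluster.ID)
		if err != nil {
			return err
		}
		if status.Status == ClusterStatusReady {
			return nil
		}
		if status.Status == ClusterStatusFailed {
			return fmt.Errorf("provisioning failed: %s", status.Error)
		}
		time.Sleep(2 * time.Second)
	}
}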
|
||||
|
||||
// ProvisionCluster provisions a complete namespace cluster (RQLite + Olric + Gateway).
|
||||
// This is an asynchronous operation that returns immediately with a cluster ID.
|
||||
// Use GetClusterStatus to poll for completion.
|
||||
func (cm *ClusterManager) ProvisionCluster(ctx context.Context, namespaceID int, namespaceName, provisionedBy string) (*NamespaceCluster, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
// Check if cluster already exists
|
||||
existing, err := cm.GetClusterByNamespaceID(ctx, namespaceID)
|
||||
if err == nil && existing != nil {
|
||||
if existing.Status == ClusterStatusReady {
|
||||
return existing, nil
|
||||
}
|
||||
if existing.Status == ClusterStatusProvisioning {
|
||||
return existing, nil // Already provisioning
|
||||
}
|
||||
// If failed or deprovisioning, allow re-provisioning
|
||||
}
|
||||
|
||||
// Create cluster record
|
||||
clusterID := uuid.New().String()
|
||||
cluster := &NamespaceCluster{
|
||||
ID: clusterID,
|
||||
NamespaceID: namespaceID,
|
||||
NamespaceName: namespaceName,
|
||||
Status: ClusterStatusProvisioning,
|
||||
RQLiteNodeCount: DefaultRQLiteNodeCount,
|
||||
OlricNodeCount: DefaultOlricNodeCount,
|
||||
GatewayNodeCount: DefaultGatewayNodeCount,
|
||||
ProvisionedBy: provisionedBy,
|
||||
ProvisionedAt: time.Now(),
|
||||
}
|
||||
|
||||
// Insert cluster record
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_clusters (
|
||||
id, namespace_id, namespace_name, status,
|
||||
rqlite_node_count, olric_node_count, gateway_node_count,
|
||||
provisioned_by, provisioned_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, err = cm.db.Exec(internalCtx, insertQuery,
|
||||
cluster.ID,
|
||||
cluster.NamespaceID,
|
||||
cluster.NamespaceName,
|
||||
string(cluster.Status),
|
||||
cluster.RQLiteNodeCount,
|
||||
cluster.OlricNodeCount,
|
||||
cluster.GatewayNodeCount,
|
||||
cluster.ProvisionedBy,
|
||||
cluster.ProvisionedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to create cluster record",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
// Log provisioning started event
|
||||
cm.logEvent(internalCtx, clusterID, EventProvisioningStarted, "", "Cluster provisioning started", nil)
|
||||
|
||||
// Start async provisioning
|
||||
go cm.doProvisioning(context.Background(), cluster)
|
||||
|
||||
return cluster, nil
|
||||
}
|
||||
|
||||
// doProvisioning performs the actual cluster provisioning asynchronously
|
||||
func (cm *ClusterManager) doProvisioning(ctx context.Context, cluster *NamespaceCluster) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
cm.logger.Info("Starting cluster provisioning",
|
||||
zap.String("cluster_id", cluster.ID),
|
||||
zap.String("namespace", cluster.NamespaceName),
|
||||
)
|
||||
|
||||
// Step 1: Select nodes for the cluster
|
||||
selectedNodes, err := cm.nodeSelector.SelectNodesForCluster(internalCtx, DefaultRQLiteNodeCount)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, "Failed to select nodes: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
nodeIDs := make([]string, len(selectedNodes))
|
||||
for i, n := range selectedNodes {
|
||||
nodeIDs[i] = n.NodeID
|
||||
}
|
||||
cm.logEvent(internalCtx, cluster.ID, EventNodesSelected, "", "Selected nodes for cluster", map[string]interface{}{
|
||||
"node_ids": nodeIDs,
|
||||
})
|
||||
|
||||
// Step 2: Allocate port blocks on each node
|
||||
portBlocks := make([]*PortBlock, len(selectedNodes))
|
||||
for i, node := range selectedNodes {
|
||||
block, err := cm.portAllocator.AllocatePortBlock(internalCtx, node.NodeID, cluster.ID)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to allocate ports on node %s: %v", node.NodeID, err))
|
||||
// Cleanup already allocated ports
|
||||
for j := 0; j < i; j++ {
|
||||
_ = cm.portAllocator.DeallocatePortBlock(internalCtx, cluster.ID, selectedNodes[j].NodeID)
|
||||
}
|
||||
return
|
||||
}
|
||||
portBlocks[i] = block
|
||||
cm.logEvent(internalCtx, cluster.ID, EventPortsAllocated, node.NodeID,
|
||||
fmt.Sprintf("Allocated ports %d-%d", block.PortStart, block.PortEnd), nil)
|
||||
}
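
Each allocated block carries one port per service on the node. The concrete offsets live in the port allocator, which is not part of this change; based on the "gateway HTTP port is port_start + 4" hint in the gateway router, a block starting at 10000 plausibly lays out as in the sketch below (an assumption, for orientation only):

// Hypothetical block for PortStart = 10000 (only the gateway offset of +4 is
// hinted at elsewhere in this change; the other offsets are assumptions).
block := &PortBlock{
	PortStart:           10000,
	PortEnd:             10009,
	RQLiteHTTPPort:      10000, // port_start + 0
	RQLiteRaftPort:      10001, // port_start + 1
	OlricHTTPPort:       10002, // port_start + 2
	OlricMemberlistPort: 10003, // port_start + 3
	GatewayHTTPPort:     10004, // port_start + 4
}
_ = block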
|
||||
|
||||
// Step 3: Start RQLite instances
|
||||
// First node is the leader, others join it
|
||||
rqliteInstances := make([]*rqlite.RQLiteInstance, len(selectedNodes))
|
||||
|
||||
// Start leader first
|
||||
leaderNode := selectedNodes[0]
|
||||
leaderPorts := portBlocks[0]
|
||||
leaderConfig := rqlite.InstanceConfig{
|
||||
Namespace: cluster.NamespaceName,
|
||||
NodeID: leaderNode.NodeID,
|
||||
HTTPPort: leaderPorts.RQLiteHTTPPort,
|
||||
RaftPort: leaderPorts.RQLiteRaftPort,
|
||||
HTTPAdvAddress: fmt.Sprintf("%s:%d", leaderNode.IPAddress, leaderPorts.RQLiteHTTPPort),
|
||||
RaftAdvAddress: fmt.Sprintf("%s:%d", leaderNode.IPAddress, leaderPorts.RQLiteRaftPort),
|
||||
IsLeader: true,
|
||||
}
|
||||
|
||||
leaderInstance, err := cm.rqliteSpawner.SpawnInstance(internalCtx, leaderConfig)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start RQLite leader: %v", err))
|
||||
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
|
||||
return
|
||||
}
|
||||
rqliteInstances[0] = leaderInstance
|
||||
cm.logEvent(internalCtx, cluster.ID, EventRQLiteStarted, leaderNode.NodeID, "RQLite leader started", nil)
|
||||
|
||||
// Create cluster node record for leader
|
||||
cm.createClusterNodeRecord(internalCtx, cluster.ID, leaderNode.NodeID, NodeRoleRQLiteLeader, leaderPorts, leaderInstance.PID)
|
||||
|
||||
// Start followers and join them to leader
|
||||
leaderJoinAddr := leaderInstance.AdvertisedDSN()
|
||||
for i := 1; i < len(selectedNodes); i++ {
|
||||
node := selectedNodes[i]
|
||||
ports := portBlocks[i]
|
||||
followerConfig := rqlite.InstanceConfig{
|
||||
Namespace: cluster.NamespaceName,
|
||||
NodeID: node.NodeID,
|
||||
HTTPPort: ports.RQLiteHTTPPort,
|
||||
RaftPort: ports.RQLiteRaftPort,
|
||||
HTTPAdvAddress: fmt.Sprintf("%s:%d", node.IPAddress, ports.RQLiteHTTPPort),
|
||||
RaftAdvAddress: fmt.Sprintf("%s:%d", node.IPAddress, ports.RQLiteRaftPort),
|
||||
JoinAddresses: []string{leaderJoinAddr},
|
||||
IsLeader: false,
|
||||
}
|
||||
|
||||
followerInstance, err := cm.rqliteSpawner.SpawnInstance(internalCtx, followerConfig)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start RQLite follower on node %s: %v", node.NodeID, err))
|
||||
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
|
||||
return
|
||||
}
|
||||
rqliteInstances[i] = followerInstance
|
||||
cm.logEvent(internalCtx, cluster.ID, EventRQLiteJoined, node.NodeID, "RQLite follower joined cluster", nil)
|
||||
cm.createClusterNodeRecord(internalCtx, cluster.ID, node.NodeID, NodeRoleRQLiteFollower, ports, followerInstance.PID)
|
||||
}
|
||||
|
||||
cm.logEvent(internalCtx, cluster.ID, EventRQLiteLeaderElected, leaderNode.NodeID, "RQLite cluster formed", nil)
|
||||
|
||||
// Step 4: Start Olric instances
|
||||
// Collect all memberlist addresses for peer discovery
|
||||
olricPeers := make([]string, len(selectedNodes))
|
||||
for i, node := range selectedNodes {
|
||||
olricPeers[i] = fmt.Sprintf("%s:%d", node.IPAddress, portBlocks[i].OlricMemberlistPort)
|
||||
}
|
||||
|
||||
for i, node := range selectedNodes {
|
||||
ports := portBlocks[i]
|
||||
olricConfig := olric.InstanceConfig{
|
||||
Namespace: cluster.NamespaceName,
|
||||
NodeID: node.NodeID,
|
||||
HTTPPort: ports.OlricHTTPPort,
|
||||
MemberlistPort: ports.OlricMemberlistPort,
|
||||
BindAddr: "0.0.0.0",
|
||||
AdvertiseAddr: node.IPAddress,
|
||||
PeerAddresses: olricPeers,
|
||||
}
|
||||
|
||||
_, err := cm.olricSpawner.SpawnInstance(internalCtx, olricConfig)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start Olric on node %s: %v", node.NodeID, err))
|
||||
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
|
||||
return
|
||||
}
|
||||
cm.logEvent(internalCtx, cluster.ID, EventOlricStarted, node.NodeID, "Olric instance started", nil)
|
||||
|
||||
// Update cluster node record with Olric role
|
||||
cm.updateClusterNodeOlricStatus(internalCtx, cluster.ID, node.NodeID)
|
||||
}
|
||||
|
||||
cm.logEvent(internalCtx, cluster.ID, EventOlricJoined, "", "Olric cluster formed", nil)
|
||||
|
||||
// Step 5: Start Gateway instances
|
||||
// Build Olric server list for gateway config
|
||||
olricServers := make([]string, len(selectedNodes))
|
||||
for i, node := range selectedNodes {
|
||||
olricServers[i] = fmt.Sprintf("%s:%d", node.IPAddress, portBlocks[i].OlricHTTPPort)
|
||||
}
|
||||
|
||||
for i, node := range selectedNodes {
|
||||
ports := portBlocks[i]
|
||||
gatewayConfig := gateway.InstanceConfig{
|
||||
Namespace: cluster.NamespaceName,
|
||||
NodeID: node.NodeID,
|
||||
HTTPPort: ports.GatewayHTTPPort,
|
||||
BaseDomain: cm.baseDomain,
|
||||
RQLiteDSN: fmt.Sprintf("http://%s:%d", node.IPAddress, ports.RQLiteHTTPPort),
|
||||
OlricServers: olricServers,
|
||||
NodePeerID: node.NodeID, // Use node ID as peer ID
|
||||
DataDir: cm.baseDataDir,
|
||||
}
|
||||
|
||||
_, err := cm.gatewaySpawner.SpawnInstance(internalCtx, gatewayConfig)
|
||||
if err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start Gateway on node %s: %v", node.NodeID, err))
|
||||
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
|
||||
return
|
||||
}
|
||||
cm.logEvent(internalCtx, cluster.ID, EventGatewayStarted, node.NodeID, "Gateway instance started", nil)
|
||||
|
||||
// Update cluster node record with Gateway role
|
||||
cm.updateClusterNodeGatewayStatus(internalCtx, cluster.ID, node.NodeID)
|
||||
}
|
||||
|
||||
// Step 6: Create DNS records for namespace gateway
|
||||
nodeIPs := make([]string, len(selectedNodes))
|
||||
for i, node := range selectedNodes {
|
||||
nodeIPs[i] = node.IPAddress
|
||||
}
|
||||
|
||||
if err := cm.dnsManager.CreateNamespaceRecords(internalCtx, cluster.NamespaceName, nodeIPs); err != nil {
|
||||
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to create DNS records: %v", err))
|
||||
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
|
||||
return
|
||||
}
|
||||
cm.logEvent(internalCtx, cluster.ID, EventDNSCreated, "", "DNS records created", map[string]interface{}{
|
||||
"domain": fmt.Sprintf("ns-%s.%s", cluster.NamespaceName, cm.baseDomain),
|
||||
"node_ips": nodeIPs,
|
||||
})
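
For example, with namespace "alice" on nodes 10.0.0.1 through 10.0.0.3, this step publishes the records that the default gateway's domain router later resolves from the dns_records table (a sketch based on the query used there; the full schema is not part of this change):

// Resulting rows (illustrative):
//   fqdn                            record_type  value     is_active
//   ns-alice.devnet-orama.network.  A            10.0.0.1  TRUE
//   ns-alice.devnet-orama.network.  A            10.0.0.2  TRUE
//   ns-alice.devnet-orama.network.  A            10.0.0.3  TRUE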
|
||||
|
||||
// Mark cluster as ready
|
||||
now := time.Now()
|
||||
updateQuery := `UPDATE namespace_clusters SET status = ?, ready_at = ? WHERE id = ?`
|
||||
_, err = cm.db.Exec(internalCtx, updateQuery, string(ClusterStatusReady), now, cluster.ID)
|
||||
if err != nil {
|
||||
cm.logger.Error("Failed to update cluster status to ready",
|
||||
zap.String("cluster_id", cluster.ID),
|
||||
zap.Error(err),
|
||||
)
|
||||
}
|
||||
|
||||
cm.logEvent(internalCtx, cluster.ID, EventClusterReady, "", "Cluster is ready", nil)
|
||||
|
||||
cm.logger.Info("Cluster provisioning completed",
|
||||
zap.String("cluster_id", cluster.ID),
|
||||
zap.String("namespace", cluster.NamespaceName),
|
||||
)
|
||||
}
|
||||
|
||||
// DeprovisionCluster tears down all services for a namespace cluster
|
||||
func (cm *ClusterManager) DeprovisionCluster(ctx context.Context, clusterID string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
// Get cluster info
|
||||
cluster, err := cm.GetCluster(ctx, clusterID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cm.logger.Info("Starting cluster deprovisioning",
|
||||
zap.String("cluster_id", clusterID),
|
||||
zap.String("namespace", cluster.NamespaceName),
|
||||
)
|
||||
|
||||
// Update status to deprovisioning
|
||||
updateQuery := `UPDATE namespace_clusters SET status = ? WHERE id = ?`
|
||||
_, _ = cm.db.Exec(internalCtx, updateQuery, string(ClusterStatusDeprovisioning), clusterID)
|
||||
cm.logEvent(internalCtx, clusterID, EventDeprovisionStarted, "", "Cluster deprovisioning started", nil)
|
||||
|
||||
// Stop all gateway instances
|
||||
if err := cm.gatewaySpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
|
||||
cm.logger.Warn("Error stopping gateway instances", zap.Error(err))
|
||||
}
|
||||
|
||||
// Stop all olric instances
|
||||
if err := cm.olricSpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
|
||||
cm.logger.Warn("Error stopping olric instances", zap.Error(err))
|
||||
}
|
||||
|
||||
// Stop all rqlite instances
|
||||
if err := cm.rqliteSpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
|
||||
cm.logger.Warn("Error stopping rqlite instances", zap.Error(err))
|
||||
}
|
||||
|
||||
// Delete DNS records
|
||||
if err := cm.dnsManager.DeleteNamespaceRecords(ctx, cluster.NamespaceName); err != nil {
|
||||
cm.logger.Warn("Error deleting DNS records", zap.Error(err))
|
||||
}
|
||||
|
||||
// Deallocate all ports
|
||||
if err := cm.portAllocator.DeallocateAllPortBlocks(ctx, clusterID); err != nil {
|
||||
cm.logger.Warn("Error deallocating ports", zap.Error(err))
|
||||
}
|
||||
|
||||
// Delete cluster node records
|
||||
deleteNodesQuery := `DELETE FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
|
||||
_, _ = cm.db.Exec(internalCtx, deleteNodesQuery, clusterID)
|
||||
|
||||
// Delete cluster record
|
||||
deleteClusterQuery := `DELETE FROM namespace_clusters WHERE id = ?`
|
||||
_, err = cm.db.Exec(internalCtx, deleteClusterQuery, clusterID)
|
||||
if err != nil {
|
||||
return &ClusterError{
|
||||
Message: "failed to delete cluster record",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
cm.logEvent(internalCtx, clusterID, EventDeprovisioned, "", "Cluster deprovisioned", nil)
|
||||
|
||||
cm.logger.Info("Cluster deprovisioning completed",
|
||||
zap.String("cluster_id", clusterID),
|
||||
zap.String("namespace", cluster.NamespaceName),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetCluster retrieves a cluster by ID
|
||||
func (cm *ClusterManager) GetCluster(ctx context.Context, clusterID string) (*NamespaceCluster, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
var clusters []NamespaceCluster
|
||||
query := `SELECT * FROM namespace_clusters WHERE id = ? LIMIT 1`
|
||||
err := cm.db.Query(internalCtx, &clusters, query, clusterID)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query cluster",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(clusters) == 0 {
|
||||
return nil, ErrClusterNotFound
|
||||
}
|
||||
|
||||
return &clusters[0], nil
|
||||
}
|
||||
|
||||
// GetClusterByNamespaceID retrieves a cluster by namespace ID
|
||||
func (cm *ClusterManager) GetClusterByNamespaceID(ctx context.Context, namespaceID int) (*NamespaceCluster, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
var clusters []NamespaceCluster
|
||||
query := `SELECT * FROM namespace_clusters WHERE namespace_id = ? LIMIT 1`
|
||||
err := cm.db.Query(internalCtx, &clusters, query, namespaceID)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query cluster",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(clusters) == 0 {
|
||||
return nil, ErrClusterNotFound
|
||||
}
|
||||
|
||||
return &clusters[0], nil
|
||||
}
|
||||
|
||||
// GetClusterByNamespaceName retrieves a cluster by namespace name
|
||||
func (cm *ClusterManager) GetClusterByNamespaceName(ctx context.Context, namespaceName string) (*NamespaceCluster, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
var clusters []NamespaceCluster
|
||||
query := `SELECT * FROM namespace_clusters WHERE namespace_name = ? LIMIT 1`
|
||||
err := cm.db.Query(internalCtx, &clusters, query, namespaceName)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query cluster",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(clusters) == 0 {
|
||||
return nil, ErrClusterNotFound
|
||||
}
|
||||
|
||||
return &clusters[0], nil
|
||||
}
|
||||
|
||||
// GetClusterStatus returns the detailed provisioning status of a cluster
|
||||
func (cm *ClusterManager) GetClusterStatus(ctx context.Context, clusterID string) (*ClusterProvisioningStatus, error) {
|
||||
cluster, err := cm.GetCluster(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Get cluster nodes
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
var nodes []ClusterNode
|
||||
nodesQuery := `SELECT * FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
|
||||
_ = cm.db.Query(internalCtx, &nodes, nodesQuery, clusterID)
|
||||
|
||||
// Determine component readiness
|
||||
rqliteReady := false
|
||||
olricReady := false
|
||||
gatewayReady := false
|
||||
|
||||
rqliteCount := 0
|
||||
olricCount := 0
|
||||
gatewayCount := 0
|
||||
|
||||
for _, node := range nodes {
|
||||
if node.Status == NodeStatusRunning {
|
||||
switch node.Role {
|
||||
case NodeRoleRQLiteLeader, NodeRoleRQLiteFollower:
|
||||
rqliteCount++
|
||||
case NodeRoleOlric:
|
||||
olricCount++
|
||||
case NodeRoleGateway:
|
||||
gatewayCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Consider ready if we have the expected number of each type
|
||||
rqliteReady = rqliteCount >= cluster.RQLiteNodeCount
|
||||
olricReady = olricCount >= cluster.OlricNodeCount
|
||||
gatewayReady = gatewayCount >= cluster.GatewayNodeCount
|
||||
|
||||
// DNS is ready if cluster status is ready
|
||||
dnsReady := cluster.Status == ClusterStatusReady
|
||||
|
||||
nodeIDs := make([]string, len(nodes))
|
||||
for i, n := range nodes {
|
||||
nodeIDs[i] = n.NodeID
|
||||
}
|
||||
|
||||
status := &ClusterProvisioningStatus{
|
||||
ClusterID: cluster.ID,
|
||||
Namespace: cluster.NamespaceName,
|
||||
Status: cluster.Status,
|
||||
Nodes: nodeIDs,
|
||||
RQLiteReady: rqliteReady,
|
||||
OlricReady: olricReady,
|
||||
GatewayReady: gatewayReady,
|
||||
DNSReady: dnsReady,
|
||||
Error: cluster.ErrorMessage,
|
||||
CreatedAt: cluster.ProvisionedAt,
|
||||
ReadyAt: cluster.ReadyAt,
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// failCluster marks a cluster as failed with an error message
|
||||
func (cm *ClusterManager) failCluster(ctx context.Context, clusterID, errorMsg string) {
|
||||
cm.logger.Error("Cluster provisioning failed",
|
||||
zap.String("cluster_id", clusterID),
|
||||
zap.String("error", errorMsg),
|
||||
)
|
||||
|
||||
updateQuery := `UPDATE namespace_clusters SET status = ?, error_message = ?, retry_count = retry_count + 1 WHERE id = ?`
|
||||
_, _ = cm.db.Exec(ctx, updateQuery, string(ClusterStatusFailed), errorMsg, clusterID)
|
||||
|
||||
cm.logEvent(ctx, clusterID, EventClusterFailed, "", errorMsg, nil)
|
||||
}
|
||||
|
||||
// cleanupOnFailure cleans up partial resources after a provisioning failure
|
||||
func (cm *ClusterManager) cleanupOnFailure(ctx context.Context, clusterID string, nodes []NodeCapacity, portBlocks []*PortBlock) {
|
||||
// Get namespace name from first port block
|
||||
var namespaceName string
|
||||
if len(portBlocks) > 0 {
|
||||
// Query to get namespace name from cluster
|
||||
var clusters []NamespaceCluster
|
||||
query := `SELECT namespace_name FROM namespace_clusters WHERE id = ? LIMIT 1`
|
||||
if err := cm.db.Query(ctx, &clusters, query, clusterID); err == nil && len(clusters) > 0 {
|
||||
namespaceName = clusters[0].NamespaceName
|
||||
}
|
||||
}
|
||||
|
||||
if namespaceName != "" {
|
||||
// Stop any started instances
|
||||
_ = cm.gatewaySpawner.StopAllInstances(ctx, namespaceName)
|
||||
_ = cm.olricSpawner.StopAllInstances(ctx, namespaceName)
|
||||
_ = cm.rqliteSpawner.StopAllInstances(ctx, namespaceName)
|
||||
}
|
||||
|
||||
// Deallocate ports
|
||||
for _, node := range nodes {
|
||||
_ = cm.portAllocator.DeallocatePortBlock(ctx, clusterID, node.NodeID)
|
||||
}
|
||||
|
||||
// Delete cluster node records
|
||||
deleteQuery := `DELETE FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
|
||||
_, _ = cm.db.Exec(ctx, deleteQuery, clusterID)
|
||||
}
|
||||
|
||||
// logEvent logs a cluster lifecycle event
|
||||
func (cm *ClusterManager) logEvent(ctx context.Context, clusterID string, eventType EventType, nodeID, message string, metadata map[string]interface{}) {
|
||||
eventID := uuid.New().String()
|
||||
|
||||
var metadataJSON string
|
||||
if metadata != nil {
|
||||
data, _ := json.Marshal(metadata)
|
||||
metadataJSON = string(data)
|
||||
}
|
||||
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_cluster_events (id, namespace_cluster_id, event_type, node_id, message, metadata, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, _ = cm.db.Exec(ctx, insertQuery, eventID, clusterID, string(eventType), nodeID, message, metadataJSON, time.Now())
|
||||
|
||||
cm.logger.Debug("Cluster event logged",
|
||||
zap.String("cluster_id", clusterID),
|
||||
zap.String("event_type", string(eventType)),
|
||||
zap.String("node_id", nodeID),
|
||||
zap.String("message", message),
|
||||
)
|
||||
}
|
||||
|
||||
// createClusterNodeRecord creates a record for a node in the cluster
|
||||
func (cm *ClusterManager) createClusterNodeRecord(ctx context.Context, clusterID, nodeID string, role NodeRole, ports *PortBlock, pid int) {
|
||||
recordID := uuid.New().String()
|
||||
now := time.Now()
|
||||
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_cluster_nodes (
|
||||
id, namespace_cluster_id, node_id, role,
|
||||
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
|
||||
status, process_pid, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, _ = cm.db.Exec(ctx, insertQuery,
|
||||
recordID,
|
||||
clusterID,
|
||||
nodeID,
|
||||
string(role),
|
||||
ports.RQLiteHTTPPort,
|
||||
ports.RQLiteRaftPort,
|
||||
ports.OlricHTTPPort,
|
||||
ports.OlricMemberlistPort,
|
||||
ports.GatewayHTTPPort,
|
||||
string(NodeStatusRunning),
|
||||
pid,
|
||||
now,
|
||||
now,
|
||||
)
|
||||
}
|
||||
|
||||
// updateClusterNodeOlricStatus updates a node record to indicate Olric is running
|
||||
func (cm *ClusterManager) updateClusterNodeOlricStatus(ctx context.Context, clusterID, nodeID string) {
|
||||
// Check if Olric role record exists
|
||||
var existing []ClusterNode
|
||||
checkQuery := `SELECT id FROM namespace_cluster_nodes WHERE namespace_cluster_id = ? AND node_id = ? AND role = ?`
|
||||
_ = cm.db.Query(ctx, &existing, checkQuery, clusterID, nodeID, string(NodeRoleOlric))
|
||||
|
||||
if len(existing) == 0 {
|
||||
// Create new record for Olric role
|
||||
recordID := uuid.New().String()
|
||||
now := time.Now()
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_cluster_nodes (
|
||||
id, namespace_cluster_id, node_id, role, status, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, _ = cm.db.Exec(ctx, insertQuery, recordID, clusterID, nodeID, string(NodeRoleOlric), string(NodeStatusRunning), now, now)
|
||||
}
|
||||
}
|
||||
|
||||
// updateClusterNodeGatewayStatus updates a node record to indicate Gateway is running
|
||||
func (cm *ClusterManager) updateClusterNodeGatewayStatus(ctx context.Context, clusterID, nodeID string) {
|
||||
// Check if Gateway role record exists
|
||||
var existing []ClusterNode
|
||||
checkQuery := `SELECT id FROM namespace_cluster_nodes WHERE namespace_cluster_id = ? AND node_id = ? AND role = ?`
|
||||
_ = cm.db.Query(ctx, &existing, checkQuery, clusterID, nodeID, string(NodeRoleGateway))
|
||||
|
||||
if len(existing) == 0 {
|
||||
// Create new record for Gateway role
|
||||
recordID := uuid.New().String()
|
||||
now := time.Now()
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_cluster_nodes (
|
||||
id, namespace_cluster_id, node_id, role, status, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, _ = cm.db.Exec(ctx, insertQuery, recordID, clusterID, nodeID, string(NodeRoleGateway), string(NodeStatusRunning), now, now)
|
||||
}
|
||||
}
|
||||
395 pkg/namespace/cluster_manager_test.go (new file)
@ -0,0 +1,395 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func TestClusterManagerConfig(t *testing.T) {
|
||||
cfg := ClusterManagerConfig{
|
||||
BaseDomain: "devnet-orama.network",
|
||||
BaseDataDir: "~/.orama/data/namespaces",
|
||||
}
|
||||
|
||||
if cfg.BaseDomain != "devnet-orama.network" {
|
||||
t.Errorf("BaseDomain = %s, want devnet-orama.network", cfg.BaseDomain)
|
||||
}
|
||||
if cfg.BaseDataDir != "~/.orama/data/namespaces" {
|
||||
t.Errorf("BaseDataDir = %s, want ~/.orama/data/namespaces", cfg.BaseDataDir)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClusterManager(t *testing.T) {
|
||||
mockDB := newMockRQLiteClient()
|
||||
logger := zap.NewNop()
|
||||
cfg := ClusterManagerConfig{
|
||||
BaseDomain: "devnet-orama.network",
|
||||
BaseDataDir: "/tmp/test-namespaces",
|
||||
}
|
||||
|
||||
manager := NewClusterManager(mockDB, cfg, logger)
|
||||
|
||||
if manager == nil {
|
||||
t.Fatal("NewClusterManager returned nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNamespaceCluster_InitialState(t *testing.T) {
|
||||
now := time.Now()
|
||||
|
||||
cluster := &NamespaceCluster{
|
||||
ID: "test-cluster-id",
|
||||
NamespaceID: 1,
|
||||
NamespaceName: "test-namespace",
|
||||
Status: ClusterStatusProvisioning,
|
||||
RQLiteNodeCount: DefaultRQLiteNodeCount,
|
||||
OlricNodeCount: DefaultOlricNodeCount,
|
||||
GatewayNodeCount: DefaultGatewayNodeCount,
|
||||
ProvisionedBy: "test-user",
|
||||
ProvisionedAt: now,
|
||||
ReadyAt: nil,
|
||||
ErrorMessage: "",
|
||||
RetryCount: 0,
|
||||
}
|
||||
|
||||
// Verify initial state
|
||||
if cluster.Status != ClusterStatusProvisioning {
|
||||
t.Errorf("Initial status = %s, want %s", cluster.Status, ClusterStatusProvisioning)
|
||||
}
|
||||
if cluster.ReadyAt != nil {
|
||||
t.Error("ReadyAt should be nil initially")
|
||||
}
|
||||
if cluster.ErrorMessage != "" {
|
||||
t.Errorf("ErrorMessage should be empty initially, got %s", cluster.ErrorMessage)
|
||||
}
|
||||
if cluster.RetryCount != 0 {
|
||||
t.Errorf("RetryCount should be 0 initially, got %d", cluster.RetryCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNamespaceCluster_DefaultNodeCounts(t *testing.T) {
|
||||
cluster := &NamespaceCluster{
|
||||
RQLiteNodeCount: DefaultRQLiteNodeCount,
|
||||
OlricNodeCount: DefaultOlricNodeCount,
|
||||
GatewayNodeCount: DefaultGatewayNodeCount,
|
||||
}
|
||||
|
||||
if cluster.RQLiteNodeCount != 3 {
|
||||
t.Errorf("RQLiteNodeCount = %d, want 3", cluster.RQLiteNodeCount)
|
||||
}
|
||||
if cluster.OlricNodeCount != 3 {
|
||||
t.Errorf("OlricNodeCount = %d, want 3", cluster.OlricNodeCount)
|
||||
}
|
||||
if cluster.GatewayNodeCount != 3 {
|
||||
t.Errorf("GatewayNodeCount = %d, want 3", cluster.GatewayNodeCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterProvisioningStatus_ReadinessFlags(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
rqliteReady bool
|
||||
olricReady bool
|
||||
gatewayReady bool
|
||||
dnsReady bool
|
||||
expectedAll bool
|
||||
}{
|
||||
{"All ready", true, true, true, true, true},
|
||||
{"RQLite not ready", false, true, true, true, false},
|
||||
{"Olric not ready", true, false, true, true, false},
|
||||
{"Gateway not ready", true, true, false, true, false},
|
||||
{"DNS not ready", true, true, true, false, false},
|
||||
{"None ready", false, false, false, false, false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
status := &ClusterProvisioningStatus{
|
||||
RQLiteReady: tt.rqliteReady,
|
||||
OlricReady: tt.olricReady,
|
||||
GatewayReady: tt.gatewayReady,
|
||||
DNSReady: tt.dnsReady,
|
||||
}
|
||||
|
||||
allReady := status.RQLiteReady && status.OlricReady && status.GatewayReady && status.DNSReady
|
||||
if allReady != tt.expectedAll {
|
||||
t.Errorf("All ready = %v, want %v", allReady, tt.expectedAll)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterStatusTransitions(t *testing.T) {
|
||||
// Test valid status transitions
|
||||
validTransitions := map[ClusterStatus][]ClusterStatus{
|
||||
ClusterStatusNone: {ClusterStatusProvisioning},
|
||||
ClusterStatusProvisioning: {ClusterStatusReady, ClusterStatusFailed},
|
||||
ClusterStatusReady: {ClusterStatusDegraded, ClusterStatusDeprovisioning},
|
||||
ClusterStatusDegraded: {ClusterStatusReady, ClusterStatusFailed, ClusterStatusDeprovisioning},
|
||||
ClusterStatusFailed: {ClusterStatusProvisioning, ClusterStatusDeprovisioning}, // Retry or delete
|
||||
ClusterStatusDeprovisioning: {ClusterStatusNone},
|
||||
}
|
||||
|
||||
for from, toList := range validTransitions {
|
||||
for _, to := range toList {
|
||||
t.Run(string(from)+"->"+string(to), func(t *testing.T) {
|
||||
// This is a documentation test - it verifies the expected transitions
|
||||
// The actual enforcement would be in the ClusterManager methods
|
||||
if from == to && from != ClusterStatusNone {
|
||||
t.Errorf("Status should not transition to itself: %s -> %s", from, to)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
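// canTransition is a sketch (not part of this commit) of how the transition
// table above could be enforced inside ClusterManager before a status update.
func canTransition(valid map[ClusterStatus][]ClusterStatus, from, to ClusterStatus) bool {
	for _, next := range valid[from] {
		if next == to {
			return true
		}
	}
	return false
}
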
func TestClusterNode_RoleAssignment(t *testing.T) {
|
||||
// Test that a node can have multiple roles
|
||||
roles := []NodeRole{
|
||||
NodeRoleRQLiteLeader,
|
||||
NodeRoleRQLiteFollower,
|
||||
NodeRoleOlric,
|
||||
NodeRoleGateway,
|
||||
}
|
||||
|
||||
// In the implementation, each node hosts all three services
|
||||
// but we track them as separate role records
|
||||
expectedRolesPerNode := 3 // RQLite (leader OR follower), Olric, Gateway
|
||||
|
||||
// For a 3-node cluster
|
||||
nodesCount := 3
|
||||
totalRoleRecords := nodesCount * expectedRolesPerNode
|
||||
|
||||
if totalRoleRecords != 9 {
|
||||
t.Errorf("Expected 9 role records for 3 nodes, got %d", totalRoleRecords)
|
||||
}
|
||||
|
||||
// Verify all roles are represented
|
||||
if len(roles) != 4 {
|
||||
t.Errorf("Expected 4 role types, got %d", len(roles))
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterEvent_LifecycleEvents(t *testing.T) {
|
||||
// Test all lifecycle events are properly ordered
|
||||
lifecycleOrder := []EventType{
|
||||
EventProvisioningStarted,
|
||||
EventNodesSelected,
|
||||
EventPortsAllocated,
|
||||
EventRQLiteStarted,
|
||||
EventRQLiteJoined,
|
||||
EventRQLiteLeaderElected,
|
||||
EventOlricStarted,
|
||||
EventOlricJoined,
|
||||
EventGatewayStarted,
|
||||
EventDNSCreated,
|
||||
EventClusterReady,
|
||||
}
|
||||
|
||||
// Verify we have all the events
|
||||
if len(lifecycleOrder) != 11 {
|
||||
t.Errorf("Expected 11 lifecycle events, got %d", len(lifecycleOrder))
|
||||
}
|
||||
|
||||
// Verify they're all unique
|
||||
seen := make(map[EventType]bool)
|
||||
for _, event := range lifecycleOrder {
|
||||
if seen[event] {
|
||||
t.Errorf("Duplicate event type: %s", event)
|
||||
}
|
||||
seen[event] = true
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterEvent_FailureEvents(t *testing.T) {
|
||||
failureEvents := []EventType{
|
||||
EventClusterDegraded,
|
||||
EventClusterFailed,
|
||||
EventNodeFailed,
|
||||
}
|
||||
|
||||
for _, event := range failureEvents {
|
||||
t.Run(string(event), func(t *testing.T) {
|
||||
if event == "" {
|
||||
t.Error("Event type should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterEvent_RecoveryEvents(t *testing.T) {
|
||||
recoveryEvents := []EventType{
|
||||
EventNodeRecovered,
|
||||
}
|
||||
|
||||
for _, event := range recoveryEvents {
|
||||
t.Run(string(event), func(t *testing.T) {
|
||||
if event == "" {
|
||||
t.Error("Event type should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterEvent_DeprovisioningEvents(t *testing.T) {
|
||||
deprovisionEvents := []EventType{
|
||||
EventDeprovisionStarted,
|
||||
EventDeprovisioned,
|
||||
}
|
||||
|
||||
for _, event := range deprovisionEvents {
|
||||
t.Run(string(event), func(t *testing.T) {
|
||||
if event == "" {
|
||||
t.Error("Event type should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestProvisioningResponse_PollURL(t *testing.T) {
|
||||
clusterID := "test-cluster-123"
|
||||
expectedPollURL := "/v1/namespace/status?id=test-cluster-123"
|
||||
|
||||
pollURL := "/v1/namespace/status?id=" + clusterID
|
||||
if pollURL != expectedPollURL {
|
||||
t.Errorf("PollURL = %s, want %s", pollURL, expectedPollURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_PortAllocationOrder(t *testing.T) {
|
||||
// Verify the order of port assignments within a block
|
||||
portStart := 10000
|
||||
|
||||
rqliteHTTP := portStart + 0
|
||||
rqliteRaft := portStart + 1
|
||||
olricHTTP := portStart + 2
|
||||
olricMemberlist := portStart + 3
|
||||
gatewayHTTP := portStart + 4
|
||||
|
||||
// Verify order
|
||||
if rqliteHTTP != 10000 {
|
||||
t.Errorf("RQLite HTTP port = %d, want 10000", rqliteHTTP)
|
||||
}
|
||||
if rqliteRaft != 10001 {
|
||||
t.Errorf("RQLite Raft port = %d, want 10001", rqliteRaft)
|
||||
}
|
||||
if olricHTTP != 10002 {
|
||||
t.Errorf("Olric HTTP port = %d, want 10002", olricHTTP)
|
||||
}
|
||||
if olricMemberlist != 10003 {
|
||||
t.Errorf("Olric Memberlist port = %d, want 10003", olricMemberlist)
|
||||
}
|
||||
if gatewayHTTP != 10004 {
|
||||
t.Errorf("Gateway HTTP port = %d, want 10004", gatewayHTTP)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_DNSFormat(t *testing.T) {
|
||||
// Test the DNS domain format for namespace gateways
|
||||
baseDomain := "devnet-orama.network"
|
||||
namespaceName := "alice"
|
||||
|
||||
expectedDomain := "ns-alice.devnet-orama.network"
|
||||
actualDomain := "ns-" + namespaceName + "." + baseDomain
|
||||
|
||||
if actualDomain != expectedDomain {
|
||||
t.Errorf("DNS domain = %s, want %s", actualDomain, expectedDomain)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_RQLiteAddresses(t *testing.T) {
|
||||
// Test RQLite advertised address format
|
||||
nodeIP := "192.168.1.100"
|
||||
|
||||
expectedHTTPAddr := "192.168.1.100:10000"
|
||||
expectedRaftAddr := "192.168.1.100:10001"
|
||||
|
||||
httpAddr := nodeIP + ":10000"
|
||||
raftAddr := nodeIP + ":10001"
|
||||
|
||||
if httpAddr != expectedHTTPAddr {
|
||||
t.Errorf("HTTP address = %s, want %s", httpAddr, expectedHTTPAddr)
|
||||
}
|
||||
if raftAddr != expectedRaftAddr {
|
||||
t.Errorf("Raft address = %s, want %s", raftAddr, expectedRaftAddr)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_OlricPeerFormat(t *testing.T) {
|
||||
// Test Olric peer address format
|
||||
nodes := []struct {
|
||||
ip string
|
||||
port int
|
||||
}{
|
||||
{"192.168.1.100", 10003},
|
||||
{"192.168.1.101", 10003},
|
||||
{"192.168.1.102", 10003},
|
||||
}
|
||||
|
||||
peers := make([]string, len(nodes))
|
||||
for i, n := range nodes {
|
||||
peers[i] = n.ip + ":10003"
|
||||
}
|
||||
|
||||
expected := []string{
|
||||
"192.168.1.100:10003",
|
||||
"192.168.1.101:10003",
|
||||
"192.168.1.102:10003",
|
||||
}
|
||||
|
||||
for i, peer := range peers {
|
||||
if peer != expected[i] {
|
||||
t.Errorf("Peer[%d] = %s, want %s", i, peer, expected[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_GatewayRQLiteDSN(t *testing.T) {
|
||||
// Test the RQLite DSN format used by gateways
|
||||
nodeIP := "192.168.1.100"
|
||||
|
||||
expectedDSN := "http://192.168.1.100:10000"
|
||||
actualDSN := "http://" + nodeIP + ":10000"
|
||||
|
||||
if actualDSN != expectedDSN {
|
||||
t.Errorf("RQLite DSN = %s, want %s", actualDSN, expectedDSN)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_MinimumNodeRequirement(t *testing.T) {
|
||||
// A cluster requires at least 3 nodes
|
||||
minimumNodes := DefaultRQLiteNodeCount
|
||||
|
||||
if minimumNodes < 3 {
|
||||
t.Errorf("Minimum node count = %d, want at least 3 for fault tolerance", minimumNodes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterManager_QuorumCalculation(t *testing.T) {
|
||||
// For RQLite Raft consensus, quorum = (n/2) + 1
|
||||
tests := []struct {
|
||||
nodes int
|
||||
expectedQuorum int
|
||||
canLoseNodes int
|
||||
}{
|
||||
{3, 2, 1}, // 3 nodes: quorum=2, can lose 1
|
||||
{5, 3, 2}, // 5 nodes: quorum=3, can lose 2
|
||||
{7, 4, 3}, // 7 nodes: quorum=4, can lose 3
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(string(rune(tt.nodes+'0'))+" nodes", func(t *testing.T) {
|
||||
quorum := (tt.nodes / 2) + 1
|
||||
if quorum != tt.expectedQuorum {
|
||||
t.Errorf("Quorum for %d nodes = %d, want %d", tt.nodes, quorum, tt.expectedQuorum)
|
||||
}
|
||||
|
||||
canLose := tt.nodes - quorum
|
||||
if canLose != tt.canLoseNodes {
|
||||
t.Errorf("Can lose %d nodes, want %d", canLose, tt.canLoseNodes)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
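The quorum arithmetic exercised above reduces to floor(n/2) + 1; a minimal helper sketch, not part of this commit:

// raftQuorum returns the minimum number of voting RQLite nodes needed for
// Raft consensus: a strict majority of n.
func raftQuorum(n int) int { return n/2 + 1 }

For the default 3-node cluster this yields 2, so writes survive the loss of one node.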
251
pkg/namespace/dns_manager.go
Normal file
251
pkg/namespace/dns_manager.go
Normal file
@ -0,0 +1,251 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"github.com/google/uuid"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// DNSRecordManager manages DNS records for namespace clusters.
|
||||
// It creates and deletes DNS A records for namespace gateway endpoints.
|
||||
type DNSRecordManager struct {
|
||||
db rqlite.Client
|
||||
baseDomain string
|
||||
logger *zap.Logger
|
||||
}
|
||||
|
||||
// NewDNSRecordManager creates a new DNS record manager
|
||||
func NewDNSRecordManager(db rqlite.Client, baseDomain string, logger *zap.Logger) *DNSRecordManager {
|
||||
return &DNSRecordManager{
|
||||
db: db,
|
||||
baseDomain: baseDomain,
|
||||
logger: logger.With(zap.String("component", "dns-record-manager")),
|
||||
}
|
||||
}
|
||||
|
||||
// CreateNamespaceRecords creates DNS A records for a namespace cluster.
|
||||
// Each namespace gets records for ns-{namespace}.{baseDomain} pointing to its gateway nodes.
|
||||
// Multiple A records enable round-robin DNS load balancing.
|
||||
func (drm *DNSRecordManager) CreateNamespaceRecords(ctx context.Context, namespaceName string, nodeIPs []string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
if len(nodeIPs) == 0 {
|
||||
return &ClusterError{Message: "no node IPs provided for DNS records"}
|
||||
}
|
||||
|
||||
// FQDN for namespace gateway: ns-{namespace}.{baseDomain}.
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
drm.logger.Info("Creating namespace DNS records",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("fqdn", fqdn),
|
||||
zap.Strings("node_ips", nodeIPs),
|
||||
)
|
||||
|
||||
// First, delete any existing records for this namespace
|
||||
deleteQuery := `DELETE FROM dns_records WHERE fqdn = ? AND namespace = ?`
|
||||
_, err := drm.db.Exec(internalCtx, deleteQuery, fqdn, "namespace:"+namespaceName)
|
||||
if err != nil {
|
||||
drm.logger.Warn("Failed to delete existing DNS records", zap.Error(err))
|
||||
// Continue anyway - the insert will just add more records
|
||||
}
|
||||
|
||||
// Create A records for each node IP
|
||||
for _, ip := range nodeIPs {
|
||||
recordID := uuid.New().String()
|
||||
insertQuery := `
|
||||
INSERT INTO dns_records (
|
||||
id, fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
now := time.Now()
|
||||
_, err := drm.db.Exec(internalCtx, insertQuery,
|
||||
recordID,
|
||||
fqdn,
|
||||
"A",
|
||||
ip,
|
||||
60, // 60 second TTL for quick failover
|
||||
"namespace:"+namespaceName, // Track ownership with namespace prefix
|
||||
"cluster-manager", // Created by the cluster manager
|
||||
true, // Active
|
||||
now,
|
||||
now,
|
||||
)
|
||||
if err != nil {
|
||||
return &ClusterError{
|
||||
Message: fmt.Sprintf("failed to create DNS record for %s -> %s", fqdn, ip),
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also create wildcard records for deployments under this namespace
|
||||
// *.ns-{namespace}.{baseDomain} -> same IPs
|
||||
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
// Delete existing wildcard records
|
||||
_, _ = drm.db.Exec(internalCtx, deleteQuery, wildcardFqdn, "namespace:"+namespaceName)
|
||||
|
||||
for _, ip := range nodeIPs {
|
||||
recordID := uuid.New().String()
|
||||
insertQuery := `
|
||||
INSERT INTO dns_records (
|
||||
id, fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
now := time.Now()
|
||||
_, err := drm.db.Exec(internalCtx, insertQuery,
|
||||
recordID,
|
||||
wildcardFqdn,
|
||||
"A",
|
||||
ip,
|
||||
60,
|
||||
"namespace:"+namespaceName,
|
||||
"cluster-manager",
|
||||
true,
|
||||
now,
|
||||
now,
|
||||
)
|
||||
if err != nil {
|
||||
drm.logger.Warn("Failed to create wildcard DNS record",
|
||||
zap.String("fqdn", wildcardFqdn),
|
||||
zap.String("ip", ip),
|
||||
zap.Error(err),
|
||||
)
|
||||
// Continue - wildcard is nice to have but not critical
|
||||
}
|
||||
}
|
||||
|
||||
drm.logger.Info("Namespace DNS records created",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.Int("record_count", len(nodeIPs)*2), // A + wildcard
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteNamespaceRecords deletes all DNS records for a namespace
|
||||
func (drm *DNSRecordManager) DeleteNamespaceRecords(ctx context.Context, namespaceName string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
drm.logger.Info("Deleting namespace DNS records",
|
||||
zap.String("namespace", namespaceName),
|
||||
)
|
||||
|
||||
// Delete all records owned by this namespace
|
||||
deleteQuery := `DELETE FROM dns_records WHERE namespace = ?`
|
||||
_, err := drm.db.Exec(internalCtx, deleteQuery, "namespace:"+namespaceName)
|
||||
if err != nil {
|
||||
return &ClusterError{
|
||||
Message: "failed to delete namespace DNS records",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
drm.logger.Info("Namespace DNS records deleted",
|
||||
zap.String("namespace", namespaceName),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetNamespaceGatewayIPs returns the IP addresses for a namespace's gateway
|
||||
func (drm *DNSRecordManager) GetNamespaceGatewayIPs(ctx context.Context, namespaceName string) ([]string, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
type recordRow struct {
|
||||
Value string `db:"value"`
|
||||
}
|
||||
|
||||
var records []recordRow
|
||||
query := `SELECT value FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND is_active = TRUE`
|
||||
err := drm.db.Query(internalCtx, &records, query, fqdn)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query namespace DNS records",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
ips := make([]string, len(records))
|
||||
for i, r := range records {
|
||||
ips[i] = r.Value
|
||||
}
|
||||
|
||||
return ips, nil
|
||||
}
|
||||
|
||||
// UpdateNamespaceRecord updates a specific node's DNS record (for failover)
|
||||
func (drm *DNSRecordManager) UpdateNamespaceRecord(ctx context.Context, namespaceName, oldIP, newIP string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
drm.logger.Info("Updating namespace DNS record",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("old_ip", oldIP),
|
||||
zap.String("new_ip", newIP),
|
||||
)
|
||||
|
||||
// Update both the main record and wildcard record
|
||||
for _, f := range []string{fqdn, wildcardFqdn} {
|
||||
updateQuery := `UPDATE dns_records SET value = ?, updated_at = ? WHERE fqdn = ? AND value = ?`
|
||||
_, err := drm.db.Exec(internalCtx, updateQuery, newIP, time.Now(), f, oldIP)
|
||||
if err != nil {
|
||||
drm.logger.Warn("Failed to update DNS record",
|
||||
zap.String("fqdn", f),
|
||||
zap.Error(err),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DisableNamespaceRecord marks a specific IP's record as inactive (for temporary failover)
|
||||
func (drm *DNSRecordManager) DisableNamespaceRecord(ctx context.Context, namespaceName, ip string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
drm.logger.Info("Disabling namespace DNS record",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("ip", ip),
|
||||
)
|
||||
|
||||
for _, f := range []string{fqdn, wildcardFqdn} {
|
||||
updateQuery := `UPDATE dns_records SET is_active = FALSE, updated_at = ? WHERE fqdn = ? AND value = ?`
|
||||
_, _ = drm.db.Exec(internalCtx, updateQuery, time.Now(), f, ip)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// EnableNamespaceRecord marks a specific IP's record as active (for recovery)
|
||||
func (drm *DNSRecordManager) EnableNamespaceRecord(ctx context.Context, namespaceName, ip string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
|
||||
|
||||
drm.logger.Info("Enabling namespace DNS record",
|
||||
zap.String("namespace", namespaceName),
|
||||
zap.String("ip", ip),
|
||||
)
|
||||
|
||||
for _, f := range []string{fqdn, wildcardFqdn} {
|
||||
updateQuery := `UPDATE dns_records SET is_active = TRUE, updated_at = ? WHERE fqdn = ? AND value = ?`
|
||||
_, _ = drm.db.Exec(internalCtx, updateQuery, time.Now(), f, ip)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
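A minimal provisioning sketch for the manager above; the wrapper name, namespace, and IPs are illustrative, and the rqlite client and logger are assumed to come from the caller:

// provisionNamespaceDNS wires the DNS manager into a provisioning flow.
func provisionNamespaceDNS(ctx context.Context, db rqlite.Client, logger *zap.Logger) error {
	drm := NewDNSRecordManager(db, "devnet-orama.network", logger)
	ips := []string{"192.168.1.100", "192.168.1.101", "192.168.1.102"}
	// Creates ns-alice.devnet-orama.network. plus the matching wildcard records.
	return drm.CreateNamespaceRecords(ctx, "alice", ips)
}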
217
pkg/namespace/dns_manager_test.go
Normal file
217
pkg/namespace/dns_manager_test.go
Normal file
@ -0,0 +1,217 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func TestDNSRecordManager_FQDNFormat(t *testing.T) {
|
||||
// Test that FQDN is correctly formatted
|
||||
tests := []struct {
|
||||
namespace string
|
||||
baseDomain string
|
||||
expected string
|
||||
}{
|
||||
{"alice", "devnet-orama.network", "ns-alice.devnet-orama.network."},
|
||||
{"bob", "testnet-orama.network", "ns-bob.testnet-orama.network."},
|
||||
{"my-namespace", "mainnet-orama.network", "ns-my-namespace.mainnet-orama.network."},
|
||||
{"test123", "example.com", "ns-test123.example.com."},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.namespace, func(t *testing.T) {
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", tt.namespace, tt.baseDomain)
|
||||
if fqdn != tt.expected {
|
||||
t.Errorf("FQDN = %s, want %s", fqdn, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_WildcardFQDNFormat(t *testing.T) {
|
||||
// Test that wildcard FQDN is correctly formatted
|
||||
tests := []struct {
|
||||
namespace string
|
||||
baseDomain string
|
||||
expected string
|
||||
}{
|
||||
{"alice", "devnet-orama.network", "*.ns-alice.devnet-orama.network."},
|
||||
{"bob", "testnet-orama.network", "*.ns-bob.testnet-orama.network."},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.namespace, func(t *testing.T) {
|
||||
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", tt.namespace, tt.baseDomain)
|
||||
if wildcardFqdn != tt.expected {
|
||||
t.Errorf("Wildcard FQDN = %s, want %s", wildcardFqdn, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewDNSRecordManager(t *testing.T) {
|
||||
mockDB := newMockRQLiteClient()
|
||||
logger := zap.NewNop()
|
||||
baseDomain := "devnet-orama.network"
|
||||
|
||||
manager := NewDNSRecordManager(mockDB, baseDomain, logger)
|
||||
|
||||
if manager == nil {
|
||||
t.Fatal("NewDNSRecordManager returned nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_NamespacePrefix(t *testing.T) {
|
||||
// Test the namespace prefix used for tracking ownership
|
||||
namespace := "my-namespace"
|
||||
expected := "namespace:my-namespace"
|
||||
|
||||
prefix := "namespace:" + namespace
|
||||
if prefix != expected {
|
||||
t.Errorf("Namespace prefix = %s, want %s", prefix, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordTTL(t *testing.T) {
|
||||
// DNS records should have a 60-second TTL for quick failover
|
||||
expectedTTL := 60
|
||||
|
||||
// This is testing the constant used in the code
|
||||
ttl := 60
|
||||
if ttl != expectedTTL {
|
||||
t.Errorf("TTL = %d, want %d", ttl, expectedTTL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_MultipleDomainFormats(t *testing.T) {
|
||||
// Test support for different domain formats
|
||||
baseDomains := []string{
|
||||
"devnet-orama.network",
|
||||
"testnet-orama.network",
|
||||
"mainnet-orama.network",
|
||||
"custom.example.com",
|
||||
"subdomain.custom.example.com",
|
||||
}
|
||||
|
||||
for _, baseDomain := range baseDomains {
|
||||
t.Run(baseDomain, func(t *testing.T) {
|
||||
namespace := "test"
|
||||
fqdn := fmt.Sprintf("ns-%s.%s.", namespace, baseDomain)
|
||||
|
||||
// Verify FQDN ends with trailing dot
|
||||
if fqdn[len(fqdn)-1] != '.' {
|
||||
t.Errorf("FQDN should end with trailing dot: %s", fqdn)
|
||||
}
|
||||
|
||||
// Verify format is correct
|
||||
expectedPrefix := "ns-test."
|
||||
if len(fqdn) <= len(expectedPrefix) {
|
||||
t.Errorf("FQDN too short: %s", fqdn)
|
||||
}
|
||||
if fqdn[:len(expectedPrefix)] != expectedPrefix {
|
||||
t.Errorf("FQDN should start with %s: %s", expectedPrefix, fqdn)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_IPValidation(t *testing.T) {
|
||||
// Test IP address formats that should be accepted
|
||||
validIPs := []string{
|
||||
"192.168.1.1",
|
||||
"10.0.0.1",
|
||||
"172.16.0.1",
|
||||
"1.2.3.4",
|
||||
"255.255.255.255",
|
||||
}
|
||||
|
||||
for _, ip := range validIPs {
|
||||
t.Run(ip, func(t *testing.T) {
|
||||
// Basic validation: IP should not be empty
|
||||
if ip == "" {
|
||||
t.Error("IP should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_EmptyNodeIPs(t *testing.T) {
|
||||
// Creating records with empty node IPs should be an error
|
||||
nodeIPs := []string{}
|
||||
|
||||
if len(nodeIPs) == 0 {
|
||||
// This condition should trigger the error in CreateNamespaceRecords
|
||||
err := &ClusterError{Message: "no node IPs provided for DNS records"}
|
||||
if err.Message != "no node IPs provided for DNS records" {
|
||||
t.Error("Expected error message for empty IPs")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_RecordTypes(t *testing.T) {
|
||||
// DNS records for namespace gateways should be A records
|
||||
expectedRecordType := "A"
|
||||
|
||||
recordType := "A"
|
||||
if recordType != expectedRecordType {
|
||||
t.Errorf("Record type = %s, want %s", recordType, expectedRecordType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_CreatedByField(t *testing.T) {
|
||||
// Records should be created by "cluster-manager"
|
||||
expected := "cluster-manager"
|
||||
|
||||
createdBy := "cluster-manager"
|
||||
if createdBy != expected {
|
||||
t.Errorf("CreatedBy = %s, want %s", createdBy, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_RoundRobinConcept(t *testing.T) {
|
||||
// Test that multiple A records for the same FQDN enable round-robin
|
||||
nodeIPs := []string{
|
||||
"192.168.1.100",
|
||||
"192.168.1.101",
|
||||
"192.168.1.102",
|
||||
}
|
||||
|
||||
// For round-robin DNS, we need one A record per IP
|
||||
expectedRecordCount := len(nodeIPs)
|
||||
|
||||
if expectedRecordCount != 3 {
|
||||
t.Errorf("Expected %d A records for round-robin, got %d", 3, expectedRecordCount)
|
||||
}
|
||||
|
||||
// Each IP should be unique
|
||||
seen := make(map[string]bool)
|
||||
for _, ip := range nodeIPs {
|
||||
if seen[ip] {
|
||||
t.Errorf("Duplicate IP in node list: %s", ip)
|
||||
}
|
||||
seen[ip] = true
|
||||
}
|
||||
}
|
||||
|
||||
func TestDNSRecordManager_FQDNWithTrailingDot(t *testing.T) {
|
||||
// DNS FQDNs should always end with a trailing dot
|
||||
// This is important for proper DNS resolution
|
||||
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"ns-alice.devnet-orama.network", "ns-alice.devnet-orama.network."},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
fqdn := tt.input + "."
|
||||
if fqdn != tt.expected {
|
||||
t.Errorf("FQDN = %s, want %s", fqdn, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
382
pkg/namespace/node_selector.go
Normal file
382
pkg/namespace/node_selector.go
Normal file
@ -0,0 +1,382 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// ClusterNodeSelector selects optimal nodes for namespace clusters.
|
||||
// It extends the existing capacity scoring system from deployments/home_node.go
|
||||
// to select multiple nodes based on available capacity.
|
||||
type ClusterNodeSelector struct {
|
||||
db rqlite.Client
|
||||
portAllocator *NamespacePortAllocator
|
||||
logger *zap.Logger
|
||||
}
|
||||
|
||||
// NodeCapacity represents the capacity metrics for a single node
|
||||
type NodeCapacity struct {
|
||||
NodeID string `json:"node_id"`
|
||||
IPAddress string `json:"ip_address"`
|
||||
DeploymentCount int `json:"deployment_count"`
|
||||
AllocatedPorts int `json:"allocated_ports"`
|
||||
AvailablePorts int `json:"available_ports"`
|
||||
UsedMemoryMB int `json:"used_memory_mb"`
|
||||
AvailableMemoryMB int `json:"available_memory_mb"`
|
||||
UsedCPUPercent int `json:"used_cpu_percent"`
|
||||
NamespaceInstanceCount int `json:"namespace_instance_count"` // Number of namespace clusters on this node
|
||||
AvailableNamespaceSlots int `json:"available_namespace_slots"` // How many more namespace instances can fit
|
||||
Score float64 `json:"score"`
|
||||
}
|
||||
|
||||
// NewClusterNodeSelector creates a new node selector
|
||||
func NewClusterNodeSelector(db rqlite.Client, portAllocator *NamespacePortAllocator, logger *zap.Logger) *ClusterNodeSelector {
|
||||
return &ClusterNodeSelector{
|
||||
db: db,
|
||||
portAllocator: portAllocator,
|
||||
logger: logger.With(zap.String("component", "cluster-node-selector")),
|
||||
}
|
||||
}
|
||||
|
||||
// SelectNodesForCluster selects the optimal N nodes for a new namespace cluster.
|
||||
// Returns the node IDs sorted by score (best first).
|
||||
func (cns *ClusterNodeSelector) SelectNodesForCluster(ctx context.Context, nodeCount int) ([]NodeCapacity, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
// Get all active nodes
|
||||
activeNodes, err := cns.getActiveNodes(internalCtx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cns.logger.Debug("Found active nodes", zap.Int("count", len(activeNodes)))
|
||||
|
||||
// Filter nodes that have capacity for namespace instances
|
||||
eligibleNodes := make([]NodeCapacity, 0)
|
||||
for _, node := range activeNodes {
|
||||
capacity, err := cns.getNodeCapacity(internalCtx, node.NodeID, node.IPAddress)
|
||||
if err != nil {
|
||||
cns.logger.Warn("Failed to get node capacity, skipping",
|
||||
zap.String("node_id", node.NodeID),
|
||||
zap.Error(err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
// Only include nodes with available namespace slots
|
||||
if capacity.AvailableNamespaceSlots > 0 {
|
||||
eligibleNodes = append(eligibleNodes, *capacity)
|
||||
} else {
|
||||
cns.logger.Debug("Node at capacity, skipping",
|
||||
zap.String("node_id", node.NodeID),
|
||||
zap.Int("namespace_instances", capacity.NamespaceInstanceCount),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
cns.logger.Debug("Eligible nodes after filtering", zap.Int("count", len(eligibleNodes)))
|
||||
|
||||
// Check if we have enough nodes
|
||||
if len(eligibleNodes) < nodeCount {
|
||||
return nil, &ClusterError{
|
||||
Message: ErrInsufficientNodes.Message,
|
||||
Cause: nil,
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by score (highest first)
|
||||
sort.Slice(eligibleNodes, func(i, j int) bool {
|
||||
return eligibleNodes[i].Score > eligibleNodes[j].Score
|
||||
})
|
||||
|
||||
// Return top N nodes
|
||||
selectedNodes := eligibleNodes[:nodeCount]
|
||||
|
||||
cns.logger.Info("Selected nodes for cluster",
|
||||
zap.Int("requested", nodeCount),
|
||||
zap.Int("selected", len(selectedNodes)),
|
||||
)
|
||||
|
||||
for i, node := range selectedNodes {
|
||||
cns.logger.Debug("Selected node",
|
||||
zap.Int("rank", i+1),
|
||||
zap.String("node_id", node.NodeID),
|
||||
zap.Float64("score", node.Score),
|
||||
zap.Int("namespace_instances", node.NamespaceInstanceCount),
|
||||
zap.Int("available_slots", node.AvailableNamespaceSlots),
|
||||
)
|
||||
}
|
||||
|
||||
return selectedNodes, nil
|
||||
}
|
||||
|
||||
// nodeInfo is used for querying active nodes
|
||||
type nodeInfo struct {
|
||||
NodeID string `db:"id"`
|
||||
IPAddress string `db:"ip_address"`
|
||||
}
|
||||
|
||||
// getActiveNodes retrieves all active nodes from dns_nodes table
|
||||
func (cns *ClusterNodeSelector) getActiveNodes(ctx context.Context) ([]nodeInfo, error) {
|
||||
// Nodes must have checked in within last 2 minutes
|
||||
cutoff := time.Now().Add(-2 * time.Minute)
|
||||
|
||||
var results []nodeInfo
|
||||
query := `
|
||||
SELECT id, ip_address FROM dns_nodes
|
||||
WHERE status = 'active' AND last_seen > ?
|
||||
ORDER BY id
|
||||
`
|
||||
err := cns.db.Query(ctx, &results, query, cutoff.Format("2006-01-02 15:04:05"))
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query active nodes",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
cns.logger.Debug("Found active nodes",
|
||||
zap.Int("count", len(results)),
|
||||
)
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// getNodeCapacity calculates capacity metrics for a single node
|
||||
func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipAddress string) (*NodeCapacity, error) {
|
||||
// Get deployment count
|
||||
deploymentCount, err := cns.getDeploymentCount(ctx, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Get allocated deployment ports
|
||||
allocatedPorts, err := cns.getDeploymentPortCount(ctx, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Get resource usage from home_node_assignments
|
||||
totalMemoryMB, totalCPUPercent, err := cns.getNodeResourceUsage(ctx, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Get namespace instance count
|
||||
namespaceInstanceCount, err := cns.portAllocator.GetNodeAllocationCount(ctx, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Calculate available capacity
|
||||
const (
|
||||
maxDeployments = 100
|
||||
maxPorts = 9900 // User deployment port range
|
||||
maxMemoryMB = 8192 // 8GB
|
||||
maxCPUPercent = 400 // 4 cores
|
||||
)
|
||||
|
||||
availablePorts := maxPorts - allocatedPorts
|
||||
if availablePorts < 0 {
|
||||
availablePorts = 0
|
||||
}
|
||||
|
||||
availableMemoryMB := maxMemoryMB - totalMemoryMB
|
||||
if availableMemoryMB < 0 {
|
||||
availableMemoryMB = 0
|
||||
}
|
||||
|
||||
availableNamespaceSlots := MaxNamespacesPerNode - namespaceInstanceCount
|
||||
if availableNamespaceSlots < 0 {
|
||||
availableNamespaceSlots = 0
|
||||
}
|
||||
|
||||
// Calculate capacity score (0.0 to 1.0, higher is better)
|
||||
// Extended from home_node.go to include namespace instance count
|
||||
score := cns.calculateCapacityScore(
|
||||
deploymentCount, maxDeployments,
|
||||
allocatedPorts, maxPorts,
|
||||
totalMemoryMB, maxMemoryMB,
|
||||
totalCPUPercent, maxCPUPercent,
|
||||
namespaceInstanceCount, MaxNamespacesPerNode,
|
||||
)
|
||||
|
||||
capacity := &NodeCapacity{
|
||||
NodeID: nodeID,
|
||||
IPAddress: ipAddress,
|
||||
DeploymentCount: deploymentCount,
|
||||
AllocatedPorts: allocatedPorts,
|
||||
AvailablePorts: availablePorts,
|
||||
UsedMemoryMB: totalMemoryMB,
|
||||
AvailableMemoryMB: availableMemoryMB,
|
||||
UsedCPUPercent: totalCPUPercent,
|
||||
NamespaceInstanceCount: namespaceInstanceCount,
|
||||
AvailableNamespaceSlots: availableNamespaceSlots,
|
||||
Score: score,
|
||||
}
|
||||
|
||||
return capacity, nil
|
||||
}
|
||||
|
||||
// getDeploymentCount counts active deployments on a node
|
||||
func (cns *ClusterNodeSelector) getDeploymentCount(ctx context.Context, nodeID string) (int, error) {
|
||||
type countResult struct {
|
||||
Count int `db:"count"`
|
||||
}
|
||||
|
||||
var results []countResult
|
||||
query := `SELECT COUNT(*) as count FROM deployments WHERE home_node_id = ? AND status IN ('active', 'deploying')`
|
||||
err := cns.db.Query(ctx, &results, query, nodeID)
|
||||
if err != nil {
|
||||
return 0, &ClusterError{
|
||||
Message: "failed to count deployments",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return results[0].Count, nil
|
||||
}
|
||||
|
||||
// getDeploymentPortCount counts allocated deployment ports on a node
|
||||
func (cns *ClusterNodeSelector) getDeploymentPortCount(ctx context.Context, nodeID string) (int, error) {
|
||||
type countResult struct {
|
||||
Count int `db:"count"`
|
||||
}
|
||||
|
||||
var results []countResult
|
||||
query := `SELECT COUNT(*) as count FROM port_allocations WHERE node_id = ?`
|
||||
err := cns.db.Query(ctx, &results, query, nodeID)
|
||||
if err != nil {
|
||||
return 0, &ClusterError{
|
||||
Message: "failed to count allocated ports",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return results[0].Count, nil
|
||||
}
|
||||
|
||||
// getNodeResourceUsage sums up resource usage for all namespaces on a node
|
||||
func (cns *ClusterNodeSelector) getNodeResourceUsage(ctx context.Context, nodeID string) (int, int, error) {
|
||||
type resourceResult struct {
|
||||
TotalMemoryMB int `db:"total_memory"`
|
||||
TotalCPUPercent int `db:"total_cpu"`
|
||||
}
|
||||
|
||||
var results []resourceResult
|
||||
query := `
|
||||
SELECT
|
||||
COALESCE(SUM(total_memory_mb), 0) as total_memory,
|
||||
COALESCE(SUM(total_cpu_percent), 0) as total_cpu
|
||||
FROM home_node_assignments
|
||||
WHERE home_node_id = ?
|
||||
`
|
||||
err := cns.db.Query(ctx, &results, query, nodeID)
|
||||
if err != nil {
|
||||
return 0, 0, &ClusterError{
|
||||
Message: "failed to query resource usage",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return 0, 0, nil
|
||||
}
|
||||
|
||||
return results[0].TotalMemoryMB, results[0].TotalCPUPercent, nil
|
||||
}
|
||||
|
||||
// calculateCapacityScore calculates a weighted capacity score (0.0 to 1.0)
|
||||
// Higher scores indicate more available capacity
|
||||
func (cns *ClusterNodeSelector) calculateCapacityScore(
|
||||
deploymentCount, maxDeployments int,
|
||||
allocatedPorts, maxPorts int,
|
||||
usedMemoryMB, maxMemoryMB int,
|
||||
usedCPUPercent, maxCPUPercent int,
|
||||
namespaceInstances, maxNamespaceInstances int,
|
||||
) float64 {
|
||||
// Calculate individual component scores (0.0 to 1.0)
|
||||
deploymentScore := 1.0 - (float64(deploymentCount) / float64(maxDeployments))
|
||||
if deploymentScore < 0 {
|
||||
deploymentScore = 0
|
||||
}
|
||||
|
||||
portScore := 1.0 - (float64(allocatedPorts) / float64(maxPorts))
|
||||
if portScore < 0 {
|
||||
portScore = 0
|
||||
}
|
||||
|
||||
memoryScore := 1.0 - (float64(usedMemoryMB) / float64(maxMemoryMB))
|
||||
if memoryScore < 0 {
|
||||
memoryScore = 0
|
||||
}
|
||||
|
||||
cpuScore := 1.0 - (float64(usedCPUPercent) / float64(maxCPUPercent))
|
||||
if cpuScore < 0 {
|
||||
cpuScore = 0
|
||||
}
|
||||
|
||||
namespaceScore := 1.0 - (float64(namespaceInstances) / float64(maxNamespaceInstances))
|
||||
if namespaceScore < 0 {
|
||||
namespaceScore = 0
|
||||
}
|
||||
|
||||
// Weighted average
|
||||
// Namespace instance count gets significant weight since that's what we're optimizing for
|
||||
// Weights: deployments 30%, ports 15%, memory 15%, cpu 15%, namespace instances 25%
|
||||
totalScore := (deploymentScore * 0.30) +
|
||||
(portScore * 0.15) +
|
||||
(memoryScore * 0.15) +
|
||||
(cpuScore * 0.15) +
|
||||
(namespaceScore * 0.25)
|
||||
|
||||
cns.logger.Debug("Calculated capacity score",
|
||||
zap.Int("deployments", deploymentCount),
|
||||
zap.Int("allocated_ports", allocatedPorts),
|
||||
zap.Int("used_memory_mb", usedMemoryMB),
|
||||
zap.Int("used_cpu_percent", usedCPUPercent),
|
||||
zap.Int("namespace_instances", namespaceInstances),
|
||||
zap.Float64("deployment_score", deploymentScore),
|
||||
zap.Float64("port_score", portScore),
|
||||
zap.Float64("memory_score", memoryScore),
|
||||
zap.Float64("cpu_score", cpuScore),
|
||||
zap.Float64("namespace_score", namespaceScore),
|
||||
zap.Float64("total_score", totalScore),
|
||||
)
|
||||
|
||||
return totalScore
|
||||
}
|
||||
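// Worked example (illustrative, not part of this commit): a node at 50%
// utilisation on every axis scores
//   0.5*0.30 + 0.5*0.15 + 0.5*0.15 + 0.5*0.15 + 0.5*0.25 = 0.5,
// while a node with only deployments saturated scores
//   0*0.30 + 1.0*0.15 + 1.0*0.15 + 1.0*0.15 + 1.0*0.25 = 0.70,
// matching TestCalculateCapacityScore_HalfCapacity and
// TestCalculateCapacityScore_Weights in node_selector_test.go.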
|
||||
// GetNodeByID retrieves a node's information by ID
|
||||
func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
var results []nodeInfo
|
||||
query := `SELECT id, ip_address FROM dns_nodes WHERE id = ? LIMIT 1`
|
||||
err := cns.db.Query(internalCtx, &results, query, nodeID)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query node",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &results[0], nil
|
||||
}
|
||||
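A selection-flow sketch tying the selector to the port allocator; the wrapper name is illustrative and not part of this commit:

// pickClusterNodes returns the best-scoring nodes for a new 3-node cluster.
func pickClusterNodes(ctx context.Context, db rqlite.Client, logger *zap.Logger) ([]NodeCapacity, error) {
	portAllocator := NewNamespacePortAllocator(db, logger)
	selector := NewClusterNodeSelector(db, portAllocator, logger)
	return selector.SelectNodesForCluster(ctx, DefaultRQLiteNodeCount)
}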
227
pkg/namespace/node_selector_test.go
Normal file
227
pkg/namespace/node_selector_test.go
Normal file
@ -0,0 +1,227 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func TestCalculateCapacityScore_EmptyNode(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Empty node should have score of 1.0 (100% available)
|
||||
score := selector.calculateCapacityScore(
|
||||
0, 100, // deployments
|
||||
0, 9900, // ports
|
||||
0, 8192, // memory
|
||||
0, 400, // cpu
|
||||
0, 20, // namespace instances
|
||||
)
|
||||
|
||||
if score != 1.0 {
|
||||
t.Errorf("Empty node score = %f, want 1.0", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateCapacityScore_FullNode(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Full node should have score of 0.0 (0% available)
|
||||
score := selector.calculateCapacityScore(
|
||||
100, 100, // deployments (full)
|
||||
9900, 9900, // ports (full)
|
||||
8192, 8192, // memory (full)
|
||||
400, 400, // cpu (full)
|
||||
20, 20, // namespace instances (full)
|
||||
)
|
||||
|
||||
if score != 0.0 {
|
||||
t.Errorf("Full node score = %f, want 0.0", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateCapacityScore_HalfCapacity(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Half-full node should have score of approximately 0.5
|
||||
score := selector.calculateCapacityScore(
|
||||
50, 100, // 50% deployments
|
||||
4950, 9900, // 50% ports
|
||||
4096, 8192, // 50% memory
|
||||
200, 400, // 50% cpu
|
||||
10, 20, // 50% namespace instances
|
||||
)
|
||||
|
||||
// With all components at 50%, the weighted average should be 0.5
|
||||
expected := 0.5
|
||||
tolerance := 0.01
|
||||
|
||||
if score < expected-tolerance || score > expected+tolerance {
|
||||
t.Errorf("Half capacity score = %f, want approximately %f", score, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateCapacityScore_Weights(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Test that deployment weight is 30%, namespace instance weight is 25%
|
||||
// Only deployments full (other metrics empty)
|
||||
deploymentOnlyScore := selector.calculateCapacityScore(
|
||||
100, 100, // deployments full (contributes 0 * 0.30 = 0)
|
||||
0, 9900, // ports empty (contributes 1.0 * 0.15 = 0.15)
|
||||
0, 8192, // memory empty (contributes 1.0 * 0.15 = 0.15)
|
||||
0, 400, // cpu empty (contributes 1.0 * 0.15 = 0.15)
|
||||
0, 20, // namespace instances empty (contributes 1.0 * 0.25 = 0.25)
|
||||
)
|
||||
// Expected: 0 + 0.15 + 0.15 + 0.15 + 0.25 = 0.70
|
||||
expectedDeploymentOnly := 0.70
|
||||
tolerance := 0.01
|
||||
|
||||
if deploymentOnlyScore < expectedDeploymentOnly-tolerance || deploymentOnlyScore > expectedDeploymentOnly+tolerance {
|
||||
t.Errorf("Deployment-only-full score = %f, want %f", deploymentOnlyScore, expectedDeploymentOnly)
|
||||
}
|
||||
|
||||
// Only namespace instances full (other metrics empty)
|
||||
namespaceOnlyScore := selector.calculateCapacityScore(
|
||||
0, 100, // deployments empty (contributes 1.0 * 0.30 = 0.30)
|
||||
0, 9900, // ports empty (contributes 1.0 * 0.15 = 0.15)
|
||||
0, 8192, // memory empty (contributes 1.0 * 0.15 = 0.15)
|
||||
0, 400, // cpu empty (contributes 1.0 * 0.15 = 0.15)
|
||||
20, 20, // namespace instances full (contributes 0 * 0.25 = 0)
|
||||
)
|
||||
// Expected: 0.30 + 0.15 + 0.15 + 0.15 + 0 = 0.75
|
||||
expectedNamespaceOnly := 0.75
|
||||
|
||||
if namespaceOnlyScore < expectedNamespaceOnly-tolerance || namespaceOnlyScore > expectedNamespaceOnly+tolerance {
|
||||
t.Errorf("Namespace-only-full score = %f, want %f", namespaceOnlyScore, expectedNamespaceOnly)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateCapacityScore_NegativeValues(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Test that over-capacity values (which would produce negative scores) are clamped to 0
|
||||
score := selector.calculateCapacityScore(
|
||||
200, 100, // 200% deployments (should clamp to 0)
|
||||
20000, 9900, // over ports (should clamp to 0)
|
||||
16000, 8192, // over memory (should clamp to 0)
|
||||
800, 400, // over cpu (should clamp to 0)
|
||||
40, 20, // over namespace instances (should clamp to 0)
|
||||
)
|
||||
|
||||
if score != 0.0 {
|
||||
t.Errorf("Over-capacity score = %f, want 0.0", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeCapacity_AvailableSlots(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
instanceCount int
|
||||
expectedAvailable int
|
||||
}{
|
||||
{"Empty node", 0, 20},
|
||||
{"One instance", 1, 19},
|
||||
{"Half full", 10, 10},
|
||||
{"Almost full", 19, 1},
|
||||
{"Full", 20, 0},
|
||||
{"Over capacity", 25, 0}, // Should clamp to 0
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
available := MaxNamespacesPerNode - tt.instanceCount
|
||||
if available < 0 {
|
||||
available = 0
|
||||
}
|
||||
if available != tt.expectedAvailable {
|
||||
t.Errorf("Available slots for %d instances = %d, want %d",
|
||||
tt.instanceCount, available, tt.expectedAvailable)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClusterNodeSelector(t *testing.T) {
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
if selector == nil {
|
||||
t.Fatal("NewClusterNodeSelector returned nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeCapacityStruct(t *testing.T) {
|
||||
// Test NodeCapacity struct initialization
|
||||
capacity := NodeCapacity{
|
||||
NodeID: "node-123",
|
||||
IPAddress: "192.168.1.100",
|
||||
DeploymentCount: 10,
|
||||
AllocatedPorts: 50,
|
||||
AvailablePorts: 9850,
|
||||
UsedMemoryMB: 2048,
|
||||
AvailableMemoryMB: 6144,
|
||||
UsedCPUPercent: 100,
|
||||
NamespaceInstanceCount: 5,
|
||||
AvailableNamespaceSlots: 15,
|
||||
Score: 0.75,
|
||||
}
|
||||
|
||||
if capacity.NodeID != "node-123" {
|
||||
t.Errorf("NodeID = %s, want node-123", capacity.NodeID)
|
||||
}
|
||||
if capacity.AvailableNamespaceSlots != 15 {
|
||||
t.Errorf("AvailableNamespaceSlots = %d, want 15", capacity.AvailableNamespaceSlots)
|
||||
}
|
||||
if capacity.Score != 0.75 {
|
||||
t.Errorf("Score = %f, want 0.75", capacity.Score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScoreRanking(t *testing.T) {
|
||||
// Test that higher scores indicate more available capacity
|
||||
logger := zap.NewNop()
|
||||
mockDB := newMockRQLiteClient()
|
||||
portAllocator := NewNamespacePortAllocator(mockDB, logger)
|
||||
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
|
||||
|
||||
// Node A: Light load
|
||||
scoreA := selector.calculateCapacityScore(
|
||||
10, 100, // 10% deployments
|
||||
500, 9900, // ~5% ports
|
||||
1000, 8192, // ~12% memory
|
||||
50, 400, // ~12% cpu
|
||||
2, 20, // 10% namespace instances
|
||||
)
|
||||
|
||||
// Node B: Heavy load
|
||||
scoreB := selector.calculateCapacityScore(
|
||||
80, 100, // 80% deployments
|
||||
8000, 9900, // ~80% ports
|
||||
7000, 8192, // ~85% memory
|
||||
350, 400, // ~87% cpu
|
||||
18, 20, // 90% namespace instances
|
||||
)
|
||||
|
||||
if scoreA <= scoreB {
|
||||
t.Errorf("Light load score (%f) should be higher than heavy load score (%f)", scoreA, scoreB)
|
||||
}
|
||||
}
|
||||
341
pkg/namespace/port_allocator.go
Normal file
341
pkg/namespace/port_allocator.go
Normal file
@ -0,0 +1,341 @@
|
||||
package namespace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/client"
|
||||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||
"github.com/google/uuid"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// NamespacePortAllocator manages the reserved port range (10000-10099) for namespace services.
|
||||
// Each namespace instance on a node gets a block of 5 consecutive ports.
|
||||
type NamespacePortAllocator struct {
|
||||
db rqlite.Client
|
||||
logger *zap.Logger
|
||||
}
|
||||
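// Layout note (derived from the constants referenced below, not new behaviour):
// the 10000-10099 range holds 100 ports and each namespace takes a block of 5,
// so a node hosts at most 20 namespace instances. Within a block starting at P,
// the offsets are P+0 RQLite HTTP, P+1 RQLite Raft, P+2 Olric HTTP,
// P+3 Olric memberlist, and P+4 gateway HTTP.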
|
||||
// NewNamespacePortAllocator creates a new port allocator
|
||||
func NewNamespacePortAllocator(db rqlite.Client, logger *zap.Logger) *NamespacePortAllocator {
|
||||
return &NamespacePortAllocator{
|
||||
db: db,
|
||||
logger: logger.With(zap.String("component", "namespace-port-allocator")),
|
||||
}
|
||||
}
|
||||
|
||||
// AllocatePortBlock finds and allocates the next available 5-port block on a node.
|
||||
// Returns an error if the node is at capacity (20 namespace instances).
|
||||
func (npa *NamespacePortAllocator) AllocatePortBlock(ctx context.Context, nodeID, namespaceClusterID string) (*PortBlock, error) {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
// Check if allocation already exists for this namespace on this node
|
||||
existingBlock, err := npa.GetPortBlock(ctx, namespaceClusterID, nodeID)
|
||||
if err == nil && existingBlock != nil {
|
||||
npa.logger.Debug("Port block already allocated",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.String("namespace_cluster_id", namespaceClusterID),
|
||||
zap.Int("port_start", existingBlock.PortStart),
|
||||
)
|
||||
return existingBlock, nil
|
||||
}
|
||||
|
||||
// Retry logic for handling concurrent allocation conflicts
|
||||
maxRetries := 10
|
||||
retryDelay := 100 * time.Millisecond
|
||||
|
||||
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||
block, err := npa.tryAllocatePortBlock(internalCtx, nodeID, namespaceClusterID)
|
||||
if err == nil {
|
||||
npa.logger.Info("Port block allocated successfully",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.String("namespace_cluster_id", namespaceClusterID),
|
||||
zap.Int("port_start", block.PortStart),
|
||||
zap.Int("attempt", attempt+1),
|
||||
)
|
||||
return block, nil
|
||||
}
|
||||
|
||||
// If it's a conflict error, retry with exponential backoff
|
||||
if isConflictError(err) {
|
||||
npa.logger.Debug("Port allocation conflict, retrying",
|
||||
zap.String("node_id", nodeID),
|
||||
zap.String("namespace_cluster_id", namespaceClusterID),
|
||||
zap.Int("attempt", attempt+1),
|
||||
zap.Error(err),
|
||||
)
|
||||
time.Sleep(retryDelay)
|
||||
retryDelay *= 2
|
||||
continue
|
||||
}
|
||||
|
||||
// Other errors are non-retryable
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return nil, &ClusterError{
|
||||
Message: fmt.Sprintf("failed to allocate port block after %d retries", maxRetries),
|
||||
}
|
||||
}
|
||||
|
||||
// tryAllocatePortBlock attempts to allocate a port block (single attempt)
|
||||
func (npa *NamespacePortAllocator) tryAllocatePortBlock(ctx context.Context, nodeID, namespaceClusterID string) (*PortBlock, error) {
|
||||
// Query all allocated port blocks on this node
|
||||
type portRow struct {
|
||||
PortStart int `db:"port_start"`
|
||||
}
|
||||
|
||||
var allocatedBlocks []portRow
|
||||
query := `SELECT port_start FROM namespace_port_allocations WHERE node_id = ? ORDER BY port_start ASC`
|
||||
err := npa.db.Query(ctx, &allocatedBlocks, query, nodeID)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to query allocated ports",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
// Build map of allocated block starts
|
||||
allocatedStarts := make(map[int]bool)
|
||||
for _, row := range allocatedBlocks {
|
||||
allocatedStarts[row.PortStart] = true
|
||||
}
|
||||
|
||||
// Check node capacity
|
||||
if len(allocatedBlocks) >= MaxNamespacesPerNode {
|
||||
return nil, ErrNodeAtCapacity
|
||||
}
|
||||
|
||||
// Find first available port block
|
||||
portStart := -1
|
||||
for start := NamespacePortRangeStart; start <= NamespacePortRangeEnd-PortsPerNamespace+1; start += PortsPerNamespace {
|
||||
if !allocatedStarts[start] {
|
||||
portStart = start
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if portStart < 0 {
|
||||
return nil, ErrNoPortsAvailable
|
||||
}
|
||||
|
||||
// Create port block
|
||||
block := &PortBlock{
|
||||
ID: uuid.New().String(),
|
||||
NodeID: nodeID,
|
||||
NamespaceClusterID: namespaceClusterID,
|
||||
PortStart: portStart,
|
||||
PortEnd: portStart + PortsPerNamespace - 1,
|
||||
RQLiteHTTPPort: portStart + 0,
|
||||
RQLiteRaftPort: portStart + 1,
|
||||
OlricHTTPPort: portStart + 2,
|
||||
OlricMemberlistPort: portStart + 3,
|
||||
GatewayHTTPPort: portStart + 4,
|
||||
AllocatedAt: time.Now(),
|
||||
}
|
||||
|
||||
// Attempt to insert allocation record
|
||||
insertQuery := `
|
||||
INSERT INTO namespace_port_allocations (
|
||||
id, node_id, namespace_cluster_id, port_start, port_end,
|
||||
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
|
||||
allocated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`
|
||||
_, err = npa.db.Exec(ctx, insertQuery,
|
||||
block.ID,
|
||||
block.NodeID,
|
||||
block.NamespaceClusterID,
|
||||
block.PortStart,
|
||||
block.PortEnd,
|
||||
block.RQLiteHTTPPort,
|
||||
block.RQLiteRaftPort,
|
||||
block.OlricHTTPPort,
|
||||
block.OlricMemberlistPort,
|
||||
block.GatewayHTTPPort,
|
||||
block.AllocatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, &ClusterError{
|
||||
Message: "failed to insert port allocation",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
return block, nil
|
||||
}
|
||||
|
||||
// DeallocatePortBlock releases a port block when a namespace is deprovisioned
|
||||
func (npa *NamespacePortAllocator) DeallocatePortBlock(ctx context.Context, namespaceClusterID, nodeID string) error {
|
||||
internalCtx := client.WithInternalAuth(ctx)
|
||||
|
||||
query := `DELETE FROM namespace_port_allocations WHERE namespace_cluster_id = ? AND node_id = ?`
|
||||
_, err := npa.db.Exec(internalCtx, query, namespaceClusterID, nodeID)
|
||||
if err != nil {
|
||||
return &ClusterError{
|
||||
Message: "failed to deallocate port block",
|
||||
Cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
npa.logger.Info("Port block deallocated",
|
||||
zap.String("namespace_cluster_id", namespaceClusterID),
|
||||
zap.String("node_id", nodeID),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeallocateAllPortBlocks releases all port blocks for a namespace cluster
|
||||
func (npa *NamespacePortAllocator) DeallocateAllPortBlocks(ctx context.Context, namespaceClusterID string) error {
	internalCtx := client.WithInternalAuth(ctx)

	query := `DELETE FROM namespace_port_allocations WHERE namespace_cluster_id = ?`
	_, err := npa.db.Exec(internalCtx, query, namespaceClusterID)
	if err != nil {
		return &ClusterError{
			Message: "failed to deallocate all port blocks",
			Cause:   err,
		}
	}

	npa.logger.Info("All port blocks deallocated",
		zap.String("namespace_cluster_id", namespaceClusterID),
	)

	return nil
}

// GetPortBlock retrieves the port block for a namespace on a specific node
func (npa *NamespacePortAllocator) GetPortBlock(ctx context.Context, namespaceClusterID, nodeID string) (*PortBlock, error) {
	internalCtx := client.WithInternalAuth(ctx)

	var blocks []PortBlock
	query := `
		SELECT id, node_id, namespace_cluster_id, port_start, port_end,
		       rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
		       allocated_at
		FROM namespace_port_allocations
		WHERE namespace_cluster_id = ? AND node_id = ?
		LIMIT 1
	`
	err := npa.db.Query(internalCtx, &blocks, query, namespaceClusterID, nodeID)
	if err != nil {
		return nil, &ClusterError{
			Message: "failed to query port block",
			Cause:   err,
		}
	}

	if len(blocks) == 0 {
		return nil, nil
	}

	return &blocks[0], nil
}

// GetAllPortBlocks retrieves all port blocks for a namespace cluster
func (npa *NamespacePortAllocator) GetAllPortBlocks(ctx context.Context, namespaceClusterID string) ([]PortBlock, error) {
	internalCtx := client.WithInternalAuth(ctx)

	var blocks []PortBlock
	query := `
		SELECT id, node_id, namespace_cluster_id, port_start, port_end,
		       rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
		       allocated_at
		FROM namespace_port_allocations
		WHERE namespace_cluster_id = ?
		ORDER BY port_start ASC
	`
	err := npa.db.Query(internalCtx, &blocks, query, namespaceClusterID)
	if err != nil {
		return nil, &ClusterError{
			Message: "failed to query port blocks",
			Cause:   err,
		}
	}

	return blocks, nil
}

// GetNodeCapacity returns how many more namespace instances a node can host
func (npa *NamespacePortAllocator) GetNodeCapacity(ctx context.Context, nodeID string) (int, error) {
	internalCtx := client.WithInternalAuth(ctx)

	type countResult struct {
		Count int `db:"count"`
	}

	var results []countResult
	query := `SELECT COUNT(*) as count FROM namespace_port_allocations WHERE node_id = ?`
	err := npa.db.Query(internalCtx, &results, query, nodeID)
	if err != nil {
		return 0, &ClusterError{
			Message: "failed to count allocated port blocks",
			Cause:   err,
		}
	}

	if len(results) == 0 {
		return MaxNamespacesPerNode, nil
	}

	allocated := results[0].Count
	available := MaxNamespacesPerNode - allocated

	if available < 0 {
		available = 0
	}

	return available, nil
}

// GetNodeAllocationCount returns the number of namespace instances on a node
func (npa *NamespacePortAllocator) GetNodeAllocationCount(ctx context.Context, nodeID string) (int, error) {
	internalCtx := client.WithInternalAuth(ctx)

	type countResult struct {
		Count int `db:"count"`
	}

	var results []countResult
	query := `SELECT COUNT(*) as count FROM namespace_port_allocations WHERE node_id = ?`
	err := npa.db.Query(internalCtx, &results, query, nodeID)
	if err != nil {
		return 0, &ClusterError{
			Message: "failed to count allocated port blocks",
			Cause:   err,
		}
	}

	if len(results) == 0 {
		return 0, nil
	}

	return results[0].Count, nil
}

// isConflictError checks if an error is due to a constraint violation
func isConflictError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()
	return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
}

// contains reports whether s contains substr (case-sensitive)
func contains(s, substr string) bool {
	return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
}

func findSubstring(s, substr string) bool {
	for i := 0; i <= len(s)-len(substr); i++ {
		if s[i:i+len(substr)] == substr {
			return true
		}
	}
	return false
}
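
// exampleReserveOnNode is an illustrative sketch only (it is not called anywhere in the
// package): it shows how a provisioning flow might combine GetNodeCapacity and
// GetPortBlock before placing a namespace instance on a node. The allocator, cluster ID
// and node ID are assumed to come from the surrounding provisioning code.
func exampleReserveOnNode(ctx context.Context, allocator *NamespacePortAllocator, clusterID, nodeID string) (*PortBlock, error) {
	capacity, err := allocator.GetNodeCapacity(ctx, nodeID)
	if err != nil {
		return nil, err
	}
	if capacity == 0 {
		// The node already hosts MaxNamespacesPerNode instances.
		return nil, ErrNodeAtCapacity
	}
	// Reuse an existing block if this namespace already has one on the node;
	// a nil result means the caller still needs to allocate a fresh block.
	return allocator.GetPortBlock(ctx, clusterID, nodeID)
}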
310
pkg/namespace/port_allocator_test.go
Normal file
@ -0,0 +1,310 @@
package namespace

import (
	"context"
	"database/sql"
	"errors"
	"testing"
	"time"

	"github.com/DeBrosOfficial/network/pkg/rqlite"
	"go.uber.org/zap"
)

// mockResult implements sql.Result
type mockResult struct {
	lastInsertID int64
	rowsAffected int64
}

func (m mockResult) LastInsertId() (int64, error) { return m.lastInsertID, nil }
func (m mockResult) RowsAffected() (int64, error) { return m.rowsAffected, nil }

// mockRQLiteClient implements rqlite.Client for testing
type mockRQLiteClient struct {
	queryResults map[string]interface{}
	execResults  map[string]error
	queryCalls   []mockQueryCall
	execCalls    []mockExecCall
}

type mockQueryCall struct {
	Query string
	Args  []interface{}
}

type mockExecCall struct {
	Query string
	Args  []interface{}
}

func newMockRQLiteClient() *mockRQLiteClient {
	return &mockRQLiteClient{
		queryResults: make(map[string]interface{}),
		execResults:  make(map[string]error),
		queryCalls:   make([]mockQueryCall, 0),
		execCalls:    make([]mockExecCall, 0),
	}
}

func (m *mockRQLiteClient) Query(ctx context.Context, dest any, query string, args ...any) error {
	ifaceArgs := make([]interface{}, len(args))
	for i, a := range args {
		ifaceArgs[i] = a
	}
	m.queryCalls = append(m.queryCalls, mockQueryCall{Query: query, Args: ifaceArgs})
	return nil
}

func (m *mockRQLiteClient) Exec(ctx context.Context, query string, args ...any) (sql.Result, error) {
	ifaceArgs := make([]interface{}, len(args))
	for i, a := range args {
		ifaceArgs[i] = a
	}
	m.execCalls = append(m.execCalls, mockExecCall{Query: query, Args: ifaceArgs})
	if err, ok := m.execResults[query]; ok {
		return nil, err
	}
	return mockResult{rowsAffected: 1}, nil
}

func (m *mockRQLiteClient) FindBy(ctx context.Context, dest any, table string, criteria map[string]any, opts ...rqlite.FindOption) error {
	return nil
}

func (m *mockRQLiteClient) FindOneBy(ctx context.Context, dest any, table string, criteria map[string]any, opts ...rqlite.FindOption) error {
	return nil
}

func (m *mockRQLiteClient) Save(ctx context.Context, entity any) error {
	return nil
}

func (m *mockRQLiteClient) Remove(ctx context.Context, entity any) error {
	return nil
}

func (m *mockRQLiteClient) Repository(table string) any {
	return nil
}

func (m *mockRQLiteClient) CreateQueryBuilder(table string) *rqlite.QueryBuilder {
	return nil
}

func (m *mockRQLiteClient) Tx(ctx context.Context, fn func(tx rqlite.Tx) error) error {
	return nil
}

// Ensure mockRQLiteClient implements rqlite.Client
var _ rqlite.Client = (*mockRQLiteClient)(nil)

func TestPortBlock_PortAssignment(t *testing.T) {
	// Test that port block correctly assigns ports
	block := &PortBlock{
		ID:                  "test-id",
		NodeID:              "node-1",
		NamespaceClusterID:  "cluster-1",
		PortStart:           10000,
		PortEnd:             10004,
		RQLiteHTTPPort:      10000,
		RQLiteRaftPort:      10001,
		OlricHTTPPort:       10002,
		OlricMemberlistPort: 10003,
		GatewayHTTPPort:     10004,
		AllocatedAt:         time.Now(),
	}

	// Verify port assignments
	if block.RQLiteHTTPPort != block.PortStart+0 {
		t.Errorf("RQLiteHTTPPort = %d, want %d", block.RQLiteHTTPPort, block.PortStart+0)
	}
	if block.RQLiteRaftPort != block.PortStart+1 {
		t.Errorf("RQLiteRaftPort = %d, want %d", block.RQLiteRaftPort, block.PortStart+1)
	}
	if block.OlricHTTPPort != block.PortStart+2 {
		t.Errorf("OlricHTTPPort = %d, want %d", block.OlricHTTPPort, block.PortStart+2)
	}
	if block.OlricMemberlistPort != block.PortStart+3 {
		t.Errorf("OlricMemberlistPort = %d, want %d", block.OlricMemberlistPort, block.PortStart+3)
	}
	if block.GatewayHTTPPort != block.PortStart+4 {
		t.Errorf("GatewayHTTPPort = %d, want %d", block.GatewayHTTPPort, block.PortStart+4)
	}
}

func TestPortConstants(t *testing.T) {
	// Verify constants are correctly defined
	if NamespacePortRangeStart != 10000 {
		t.Errorf("NamespacePortRangeStart = %d, want 10000", NamespacePortRangeStart)
	}
	if NamespacePortRangeEnd != 10099 {
		t.Errorf("NamespacePortRangeEnd = %d, want 10099", NamespacePortRangeEnd)
	}
	if PortsPerNamespace != 5 {
		t.Errorf("PortsPerNamespace = %d, want 5", PortsPerNamespace)
	}

	// Verify max namespaces calculation: (10099 - 10000 + 1) / 5 = 100 / 5 = 20
	expectedMax := (NamespacePortRangeEnd - NamespacePortRangeStart + 1) / PortsPerNamespace
	if MaxNamespacesPerNode != expectedMax {
		t.Errorf("MaxNamespacesPerNode = %d, want %d", MaxNamespacesPerNode, expectedMax)
	}
	if MaxNamespacesPerNode != 20 {
		t.Errorf("MaxNamespacesPerNode = %d, want 20", MaxNamespacesPerNode)
	}
}

func TestPortRangeCapacity(t *testing.T) {
	// Test that 20 namespaces fit exactly in the port range
	usedPorts := MaxNamespacesPerNode * PortsPerNamespace
	availablePorts := NamespacePortRangeEnd - NamespacePortRangeStart + 1

	if usedPorts > availablePorts {
		t.Errorf("Port range overflow: %d ports needed for %d namespaces, but only %d available",
			usedPorts, MaxNamespacesPerNode, availablePorts)
	}

	// Verify no wasted ports
	if usedPorts != availablePorts {
		t.Logf("Note: %d ports unused in range", availablePorts-usedPorts)
	}
}

func TestPortBlockAllocation_SequentialBlocks(t *testing.T) {
	// Verify that sequential port blocks don't overlap
	blocks := make([]*PortBlock, MaxNamespacesPerNode)

	for i := 0; i < MaxNamespacesPerNode; i++ {
		portStart := NamespacePortRangeStart + (i * PortsPerNamespace)
		blocks[i] = &PortBlock{
			PortStart:           portStart,
			PortEnd:             portStart + PortsPerNamespace - 1,
			RQLiteHTTPPort:      portStart + 0,
			RQLiteRaftPort:      portStart + 1,
			OlricHTTPPort:       portStart + 2,
			OlricMemberlistPort: portStart + 3,
			GatewayHTTPPort:     portStart + 4,
		}
	}

	// Verify no overlap between consecutive blocks
	for i := 0; i < len(blocks)-1; i++ {
		if blocks[i].PortEnd >= blocks[i+1].PortStart {
			t.Errorf("Block %d (end=%d) overlaps with block %d (start=%d)",
				i, blocks[i].PortEnd, i+1, blocks[i+1].PortStart)
		}
	}

	// Verify last block doesn't exceed range
	lastBlock := blocks[len(blocks)-1]
	if lastBlock.PortEnd > NamespacePortRangeEnd {
		t.Errorf("Last block exceeds port range: end=%d, max=%d",
			lastBlock.PortEnd, NamespacePortRangeEnd)
	}
}

func TestIsConflictError(t *testing.T) {
	tests := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
		{
			name:     "UNIQUE constraint error",
			err:      errors.New("UNIQUE constraint failed"),
			expected: true,
		},
		{
			name:     "constraint violation",
			err:      errors.New("constraint violation"),
			expected: true,
		},
		{
			name:     "conflict error",
			err:      errors.New("conflict detected"),
			expected: true,
		},
		{
			name:     "regular error",
			err:      errors.New("connection timeout"),
			expected: false,
		},
		{
			name:     "empty error",
			err:      errors.New(""),
			expected: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isConflictError(tt.err)
			if result != tt.expected {
				t.Errorf("isConflictError(%v) = %v, want %v", tt.err, result, tt.expected)
			}
		})
	}
}

func TestContains(t *testing.T) {
	tests := []struct {
		s        string
		substr   string
		expected bool
	}{
		{"hello world", "world", true},
		{"hello world", "hello", true},
		{"hello world", "xyz", false},
		{"", "", true},
		{"hello", "", true},
		{"", "hello", false},
		{"UNIQUE constraint", "UNIQUE", true},
	}

	for _, tt := range tests {
		t.Run(tt.s+"_"+tt.substr, func(t *testing.T) {
			result := contains(tt.s, tt.substr)
			if result != tt.expected {
				t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, result, tt.expected)
			}
		})
	}
}

func TestNewNamespacePortAllocator(t *testing.T) {
	mockDB := newMockRQLiteClient()
	logger := zap.NewNop()

	allocator := NewNamespacePortAllocator(mockDB, logger)

	if allocator == nil {
		t.Fatal("NewNamespacePortAllocator returned nil")
	}
}

func TestDefaultClusterSizes(t *testing.T) {
	// Verify default cluster size constants
	if DefaultRQLiteNodeCount != 3 {
		t.Errorf("DefaultRQLiteNodeCount = %d, want 3", DefaultRQLiteNodeCount)
	}
	if DefaultOlricNodeCount != 3 {
		t.Errorf("DefaultOlricNodeCount = %d, want 3", DefaultOlricNodeCount)
	}
	if DefaultGatewayNodeCount != 3 {
		t.Errorf("DefaultGatewayNodeCount = %d, want 3", DefaultGatewayNodeCount)
	}

	// Public namespace should have larger clusters
	if PublicRQLiteNodeCount != 5 {
		t.Errorf("PublicRQLiteNodeCount = %d, want 5", PublicRQLiteNodeCount)
	}
	if PublicOlricNodeCount != 5 {
		t.Errorf("PublicOlricNodeCount = %d, want 5", PublicOlricNodeCount)
	}
}
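
// TestMockExecConflict is an illustrative sketch of how the mock above can be used:
// execResults is keyed by the exact query string, so a test can make Exec return a
// constraint error and assert that isConflictError classifies it. The INSERT statement
// below is a stand-in for demonstration, not the allocator's real query.
func TestMockExecConflict(t *testing.T) {
	m := newMockRQLiteClient()
	query := `INSERT INTO namespace_port_allocations (id) VALUES (?)`
	m.execResults[query] = errors.New("UNIQUE constraint failed: namespace_port_allocations.port_start")

	_, err := m.Exec(context.Background(), query, "some-id")
	if !isConflictError(err) {
		t.Errorf("expected conflict error, got %v", err)
	}
	if len(m.execCalls) != 1 {
		t.Errorf("expected 1 recorded exec call, got %d", len(m.execCalls))
	}
}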
204
pkg/namespace/types.go
Normal file
@ -0,0 +1,204 @@
package namespace

import (
	"time"
)

// ClusterStatus represents the current state of a namespace cluster
type ClusterStatus string

const (
	ClusterStatusNone           ClusterStatus = "none"           // No cluster provisioned
	ClusterStatusProvisioning   ClusterStatus = "provisioning"   // Cluster is being provisioned
	ClusterStatusReady          ClusterStatus = "ready"          // Cluster is operational
	ClusterStatusDegraded       ClusterStatus = "degraded"       // Some nodes are unhealthy
	ClusterStatusFailed         ClusterStatus = "failed"         // Cluster failed to provision/operate
	ClusterStatusDeprovisioning ClusterStatus = "deprovisioning" // Cluster is being deprovisioned
)

// NodeRole represents the role of a node in a namespace cluster
type NodeRole string

const (
	NodeRoleRQLiteLeader   NodeRole = "rqlite_leader"
	NodeRoleRQLiteFollower NodeRole = "rqlite_follower"
	NodeRoleOlric          NodeRole = "olric"
	NodeRoleGateway        NodeRole = "gateway"
)

// NodeStatus represents the status of a service on a node
type NodeStatus string

const (
	NodeStatusPending  NodeStatus = "pending"
	NodeStatusStarting NodeStatus = "starting"
	NodeStatusRunning  NodeStatus = "running"
	NodeStatusStopped  NodeStatus = "stopped"
	NodeStatusFailed   NodeStatus = "failed"
)

// EventType represents types of cluster lifecycle events
type EventType string

const (
	EventProvisioningStarted EventType = "provisioning_started"
	EventNodesSelected       EventType = "nodes_selected"
	EventPortsAllocated      EventType = "ports_allocated"
	EventRQLiteStarted       EventType = "rqlite_started"
	EventRQLiteJoined        EventType = "rqlite_joined"
	EventRQLiteLeaderElected EventType = "rqlite_leader_elected"
	EventOlricStarted        EventType = "olric_started"
	EventOlricJoined         EventType = "olric_joined"
	EventGatewayStarted      EventType = "gateway_started"
	EventDNSCreated          EventType = "dns_created"
	EventClusterReady        EventType = "cluster_ready"
	EventClusterDegraded     EventType = "cluster_degraded"
	EventClusterFailed       EventType = "cluster_failed"
	EventNodeFailed          EventType = "node_failed"
	EventNodeRecovered       EventType = "node_recovered"
	EventDeprovisionStarted  EventType = "deprovisioning_started"
	EventDeprovisioned       EventType = "deprovisioned"
)

// Port allocation constants
const (
	// NamespacePortRangeStart is the beginning of the reserved port range for namespace services
	NamespacePortRangeStart = 10000

	// NamespacePortRangeEnd is the end of the reserved port range for namespace services
	NamespacePortRangeEnd = 10099

	// PortsPerNamespace is the number of ports required per namespace instance on a node:
	// RQLite HTTP (0), RQLite Raft (1), Olric HTTP (2), Olric Memberlist (3), Gateway HTTP (4)
	PortsPerNamespace = 5

	// MaxNamespacesPerNode is the maximum number of namespace instances a single node can host
	MaxNamespacesPerNode = (NamespacePortRangeEnd - NamespacePortRangeStart + 1) / PortsPerNamespace // 20
)

// Default cluster sizes
const (
	DefaultRQLiteNodeCount  = 3
	DefaultOlricNodeCount   = 3
	DefaultGatewayNodeCount = 3
	PublicRQLiteNodeCount   = 5
	PublicOlricNodeCount    = 5
)

// NamespaceCluster represents a dedicated cluster for a namespace
type NamespaceCluster struct {
	ID               string        `json:"id" db:"id"`
	NamespaceID      int           `json:"namespace_id" db:"namespace_id"`
	NamespaceName    string        `json:"namespace_name" db:"namespace_name"`
	Status           ClusterStatus `json:"status" db:"status"`
	RQLiteNodeCount  int           `json:"rqlite_node_count" db:"rqlite_node_count"`
	OlricNodeCount   int           `json:"olric_node_count" db:"olric_node_count"`
	GatewayNodeCount int           `json:"gateway_node_count" db:"gateway_node_count"`
	ProvisionedBy    string        `json:"provisioned_by" db:"provisioned_by"`
	ProvisionedAt    time.Time     `json:"provisioned_at" db:"provisioned_at"`
	ReadyAt          *time.Time    `json:"ready_at,omitempty" db:"ready_at"`
	LastHealthCheck  *time.Time    `json:"last_health_check,omitempty" db:"last_health_check"`
	ErrorMessage     string        `json:"error_message,omitempty" db:"error_message"`
	RetryCount       int           `json:"retry_count" db:"retry_count"`

	// Populated by queries, not stored directly
	Nodes []ClusterNode `json:"nodes,omitempty"`
}

// ClusterNode represents a node participating in a namespace cluster
type ClusterNode struct {
	ID                  string     `json:"id" db:"id"`
	NamespaceClusterID  string     `json:"namespace_cluster_id" db:"namespace_cluster_id"`
	NodeID              string     `json:"node_id" db:"node_id"`
	Role                NodeRole   `json:"role" db:"role"`
	RQLiteHTTPPort      int        `json:"rqlite_http_port,omitempty" db:"rqlite_http_port"`
	RQLiteRaftPort      int        `json:"rqlite_raft_port,omitempty" db:"rqlite_raft_port"`
	OlricHTTPPort       int        `json:"olric_http_port,omitempty" db:"olric_http_port"`
	OlricMemberlistPort int        `json:"olric_memberlist_port,omitempty" db:"olric_memberlist_port"`
	GatewayHTTPPort     int        `json:"gateway_http_port,omitempty" db:"gateway_http_port"`
	Status              NodeStatus `json:"status" db:"status"`
	ProcessPID          int        `json:"process_pid,omitempty" db:"process_pid"`
	LastHeartbeat       *time.Time `json:"last_heartbeat,omitempty" db:"last_heartbeat"`
	ErrorMessage        string     `json:"error_message,omitempty" db:"error_message"`
	RQLiteJoinAddress   string     `json:"rqlite_join_address,omitempty" db:"rqlite_join_address"`
	OlricPeers          string     `json:"olric_peers,omitempty" db:"olric_peers"` // JSON array
	CreatedAt           time.Time  `json:"created_at" db:"created_at"`
	UpdatedAt           time.Time  `json:"updated_at" db:"updated_at"`
}

// PortBlock represents an allocated block of ports for a namespace on a node
type PortBlock struct {
	ID                  string    `json:"id" db:"id"`
	NodeID              string    `json:"node_id" db:"node_id"`
	NamespaceClusterID  string    `json:"namespace_cluster_id" db:"namespace_cluster_id"`
	PortStart           int       `json:"port_start" db:"port_start"`
	PortEnd             int       `json:"port_end" db:"port_end"`
	RQLiteHTTPPort      int       `json:"rqlite_http_port" db:"rqlite_http_port"`
	RQLiteRaftPort      int       `json:"rqlite_raft_port" db:"rqlite_raft_port"`
	OlricHTTPPort       int       `json:"olric_http_port" db:"olric_http_port"`
	OlricMemberlistPort int       `json:"olric_memberlist_port" db:"olric_memberlist_port"`
	GatewayHTTPPort     int       `json:"gateway_http_port" db:"gateway_http_port"`
	AllocatedAt         time.Time `json:"allocated_at" db:"allocated_at"`
}

// ClusterEvent represents an audit event for cluster lifecycle
type ClusterEvent struct {
	ID                 string    `json:"id" db:"id"`
	NamespaceClusterID string    `json:"namespace_cluster_id" db:"namespace_cluster_id"`
	EventType          EventType `json:"event_type" db:"event_type"`
	NodeID             string    `json:"node_id,omitempty" db:"node_id"`
	Message            string    `json:"message,omitempty" db:"message"`
	Metadata           string    `json:"metadata,omitempty" db:"metadata"` // JSON
	CreatedAt          time.Time `json:"created_at" db:"created_at"`
}

// ClusterProvisioningStatus is the response format for the /v1/namespace/status endpoint
type ClusterProvisioningStatus struct {
	ClusterID    string        `json:"cluster_id"`
	Namespace    string        `json:"namespace"`
	Status       ClusterStatus `json:"status"`
	Nodes        []string      `json:"nodes"`
	RQLiteReady  bool          `json:"rqlite_ready"`
	OlricReady   bool          `json:"olric_ready"`
	GatewayReady bool          `json:"gateway_ready"`
	DNSReady     bool          `json:"dns_ready"`
	Error        string        `json:"error,omitempty"`
	CreatedAt    time.Time     `json:"created_at"`
	ReadyAt      *time.Time    `json:"ready_at,omitempty"`
}

// ProvisioningResponse is returned when a new namespace triggers cluster provisioning
type ProvisioningResponse struct {
	Status               string `json:"status"`
	ClusterID            string `json:"cluster_id"`
	PollURL              string `json:"poll_url"`
	EstimatedTimeSeconds int    `json:"estimated_time_seconds"`
}

// Errors
type ClusterError struct {
	Message string
	Cause   error
}

func (e *ClusterError) Error() string {
	if e.Cause != nil {
		return e.Message + ": " + e.Cause.Error()
	}
	return e.Message
}

func (e *ClusterError) Unwrap() error {
	return e.Cause
}

var (
	ErrNoPortsAvailable     = &ClusterError{Message: "no ports available on node"}
	ErrNodeAtCapacity       = &ClusterError{Message: "node has reached maximum namespace instances"}
	ErrInsufficientNodes    = &ClusterError{Message: "insufficient nodes available for cluster"}
	ErrClusterNotFound      = &ClusterError{Message: "namespace cluster not found"}
	ErrClusterAlreadyExists = &ClusterError{Message: "namespace cluster already exists"}
	ErrProvisioningFailed   = &ClusterError{Message: "cluster provisioning failed"}
	ErrNamespaceNotFound    = &ClusterError{Message: "namespace not found"}
	ErrInvalidClusterStatus = &ClusterError{Message: "invalid cluster status for operation"}
)
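
// portBlockForSlot is an illustrative helper, a sketch only (nothing in the package calls
// it): it shows how the constants above map a slot index on a node (0..MaxNamespacesPerNode-1)
// to the five concrete service ports of a PortBlock. The real allocator persists this layout
// in namespace_port_allocations; this function only demonstrates the arithmetic.
func portBlockForSlot(slot int) PortBlock {
	start := NamespacePortRangeStart + slot*PortsPerNamespace // slot 0 -> 10000, slot 1 -> 10005, ...
	return PortBlock{
		PortStart:           start,
		PortEnd:             start + PortsPerNamespace - 1,
		RQLiteHTTPPort:      start + 0,
		RQLiteRaftPort:      start + 1,
		OlricHTTPPort:       start + 2,
		OlricMemberlistPort: start + 3,
		GatewayHTTPPort:     start + 4,
	}
}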
405
pkg/namespace/types_test.go
Normal file
@ -0,0 +1,405 @@
package namespace

import (
	"errors"
	"testing"
	"time"
)

func TestClusterStatus_Values(t *testing.T) {
	// Verify all cluster status values are correct
	tests := []struct {
		status   ClusterStatus
		expected string
	}{
		{ClusterStatusNone, "none"},
		{ClusterStatusProvisioning, "provisioning"},
		{ClusterStatusReady, "ready"},
		{ClusterStatusDegraded, "degraded"},
		{ClusterStatusFailed, "failed"},
		{ClusterStatusDeprovisioning, "deprovisioning"},
	}

	for _, tt := range tests {
		t.Run(string(tt.status), func(t *testing.T) {
			if string(tt.status) != tt.expected {
				t.Errorf("ClusterStatus = %s, want %s", tt.status, tt.expected)
			}
		})
	}
}

func TestNodeRole_Values(t *testing.T) {
	// Verify all node role values are correct
	tests := []struct {
		role     NodeRole
		expected string
	}{
		{NodeRoleRQLiteLeader, "rqlite_leader"},
		{NodeRoleRQLiteFollower, "rqlite_follower"},
		{NodeRoleOlric, "olric"},
		{NodeRoleGateway, "gateway"},
	}

	for _, tt := range tests {
		t.Run(string(tt.role), func(t *testing.T) {
			if string(tt.role) != tt.expected {
				t.Errorf("NodeRole = %s, want %s", tt.role, tt.expected)
			}
		})
	}
}

func TestNodeStatus_Values(t *testing.T) {
	// Verify all node status values are correct
	tests := []struct {
		status   NodeStatus
		expected string
	}{
		{NodeStatusPending, "pending"},
		{NodeStatusStarting, "starting"},
		{NodeStatusRunning, "running"},
		{NodeStatusStopped, "stopped"},
		{NodeStatusFailed, "failed"},
	}

	for _, tt := range tests {
		t.Run(string(tt.status), func(t *testing.T) {
			if string(tt.status) != tt.expected {
				t.Errorf("NodeStatus = %s, want %s", tt.status, tt.expected)
			}
		})
	}
}

func TestEventType_Values(t *testing.T) {
	// Verify all event type values are correct
	tests := []struct {
		eventType EventType
		expected  string
	}{
		{EventProvisioningStarted, "provisioning_started"},
		{EventNodesSelected, "nodes_selected"},
		{EventPortsAllocated, "ports_allocated"},
		{EventRQLiteStarted, "rqlite_started"},
		{EventRQLiteJoined, "rqlite_joined"},
		{EventRQLiteLeaderElected, "rqlite_leader_elected"},
		{EventOlricStarted, "olric_started"},
		{EventOlricJoined, "olric_joined"},
		{EventGatewayStarted, "gateway_started"},
		{EventDNSCreated, "dns_created"},
		{EventClusterReady, "cluster_ready"},
		{EventClusterDegraded, "cluster_degraded"},
		{EventClusterFailed, "cluster_failed"},
		{EventNodeFailed, "node_failed"},
		{EventNodeRecovered, "node_recovered"},
		{EventDeprovisionStarted, "deprovisioning_started"},
		{EventDeprovisioned, "deprovisioned"},
	}

	for _, tt := range tests {
		t.Run(string(tt.eventType), func(t *testing.T) {
			if string(tt.eventType) != tt.expected {
				t.Errorf("EventType = %s, want %s", tt.eventType, tt.expected)
			}
		})
	}
}

func TestClusterError_Error(t *testing.T) {
	tests := []struct {
		name     string
		err      *ClusterError
		expected string
	}{
		{
			name:     "message only",
			err:      &ClusterError{Message: "something failed"},
			expected: "something failed",
		},
		{
			name:     "message with cause",
			err:      &ClusterError{Message: "operation failed", Cause: errors.New("connection timeout")},
			expected: "operation failed: connection timeout",
		},
		{
			name:     "empty message with cause",
			err:      &ClusterError{Message: "", Cause: errors.New("cause")},
			expected: ": cause",
		},
		{
			name:     "empty message no cause",
			err:      &ClusterError{Message: ""},
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := tt.err.Error()
			if result != tt.expected {
				t.Errorf("Error() = %q, want %q", result, tt.expected)
			}
		})
	}
}

func TestClusterError_Unwrap(t *testing.T) {
	cause := errors.New("original error")
	err := &ClusterError{
		Message: "wrapped",
		Cause:   cause,
	}

	unwrapped := err.Unwrap()
	if unwrapped != cause {
		t.Errorf("Unwrap() = %v, want %v", unwrapped, cause)
	}

	// Test with no cause
	errNoCause := &ClusterError{Message: "no cause"}
	if errNoCause.Unwrap() != nil {
		t.Errorf("Unwrap() with no cause should return nil")
	}
}

func TestPredefinedErrors(t *testing.T) {
	// Test that predefined errors have the correct messages
	tests := []struct {
		name     string
		err      *ClusterError
		expected string
	}{
		{"ErrNoPortsAvailable", ErrNoPortsAvailable, "no ports available on node"},
		{"ErrNodeAtCapacity", ErrNodeAtCapacity, "node has reached maximum namespace instances"},
		{"ErrInsufficientNodes", ErrInsufficientNodes, "insufficient nodes available for cluster"},
		{"ErrClusterNotFound", ErrClusterNotFound, "namespace cluster not found"},
		{"ErrClusterAlreadyExists", ErrClusterAlreadyExists, "namespace cluster already exists"},
		{"ErrProvisioningFailed", ErrProvisioningFailed, "cluster provisioning failed"},
		{"ErrNamespaceNotFound", ErrNamespaceNotFound, "namespace not found"},
		{"ErrInvalidClusterStatus", ErrInvalidClusterStatus, "invalid cluster status for operation"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if tt.err.Message != tt.expected {
				t.Errorf("%s.Message = %q, want %q", tt.name, tt.err.Message, tt.expected)
			}
		})
	}
}

func TestNamespaceCluster_Struct(t *testing.T) {
	now := time.Now()
	readyAt := now.Add(5 * time.Minute)

	cluster := &NamespaceCluster{
		ID:               "cluster-123",
		NamespaceID:      42,
		NamespaceName:    "test-namespace",
		Status:           ClusterStatusReady,
		RQLiteNodeCount:  3,
		OlricNodeCount:   3,
		GatewayNodeCount: 3,
		ProvisionedBy:    "admin",
		ProvisionedAt:    now,
		ReadyAt:          &readyAt,
		LastHealthCheck:  nil,
		ErrorMessage:     "",
		RetryCount:       0,
		Nodes:            nil,
	}

	if cluster.ID != "cluster-123" {
		t.Errorf("ID = %s, want cluster-123", cluster.ID)
	}
	if cluster.NamespaceID != 42 {
		t.Errorf("NamespaceID = %d, want 42", cluster.NamespaceID)
	}
	if cluster.Status != ClusterStatusReady {
		t.Errorf("Status = %s, want %s", cluster.Status, ClusterStatusReady)
	}
	if cluster.RQLiteNodeCount != 3 {
		t.Errorf("RQLiteNodeCount = %d, want 3", cluster.RQLiteNodeCount)
	}
}

func TestClusterNode_Struct(t *testing.T) {
	now := time.Now()
	heartbeat := now.Add(-30 * time.Second)

	node := &ClusterNode{
		ID:                  "node-record-123",
		NamespaceClusterID:  "cluster-456",
		NodeID:              "12D3KooWabc123",
		Role:                NodeRoleRQLiteLeader,
		RQLiteHTTPPort:      10000,
		RQLiteRaftPort:      10001,
		OlricHTTPPort:       10002,
		OlricMemberlistPort: 10003,
		GatewayHTTPPort:     10004,
		Status:              NodeStatusRunning,
		ProcessPID:          12345,
		LastHeartbeat:       &heartbeat,
		ErrorMessage:        "",
		RQLiteJoinAddress:   "192.168.1.100:10001",
		OlricPeers:          `["192.168.1.100:10003","192.168.1.101:10003"]`,
		CreatedAt:           now,
		UpdatedAt:           now,
	}

	if node.Role != NodeRoleRQLiteLeader {
		t.Errorf("Role = %s, want %s", node.Role, NodeRoleRQLiteLeader)
	}
	if node.Status != NodeStatusRunning {
		t.Errorf("Status = %s, want %s", node.Status, NodeStatusRunning)
	}
	if node.RQLiteHTTPPort != 10000 {
		t.Errorf("RQLiteHTTPPort = %d, want 10000", node.RQLiteHTTPPort)
	}
	if node.ProcessPID != 12345 {
		t.Errorf("ProcessPID = %d, want 12345", node.ProcessPID)
	}
}

func TestClusterProvisioningStatus_Struct(t *testing.T) {
	now := time.Now()
	readyAt := now.Add(2 * time.Minute)

	status := &ClusterProvisioningStatus{
		ClusterID:    "cluster-789",
		Namespace:    "my-namespace",
		Status:       ClusterStatusProvisioning,
		Nodes:        []string{"node-1", "node-2", "node-3"},
		RQLiteReady:  true,
		OlricReady:   true,
		GatewayReady: false,
		DNSReady:     false,
		Error:        "",
		CreatedAt:    now,
		ReadyAt:      &readyAt,
	}

	if status.ClusterID != "cluster-789" {
		t.Errorf("ClusterID = %s, want cluster-789", status.ClusterID)
	}
	if len(status.Nodes) != 3 {
		t.Errorf("len(Nodes) = %d, want 3", len(status.Nodes))
	}
	if !status.RQLiteReady {
		t.Error("RQLiteReady should be true")
	}
	if status.GatewayReady {
		t.Error("GatewayReady should be false")
	}
}

func TestProvisioningResponse_Struct(t *testing.T) {
	resp := &ProvisioningResponse{
		Status:               "provisioning",
		ClusterID:            "cluster-abc",
		PollURL:              "/v1/namespace/status?id=cluster-abc",
		EstimatedTimeSeconds: 120,
	}

	if resp.Status != "provisioning" {
		t.Errorf("Status = %s, want provisioning", resp.Status)
	}
	if resp.ClusterID != "cluster-abc" {
		t.Errorf("ClusterID = %s, want cluster-abc", resp.ClusterID)
	}
	if resp.EstimatedTimeSeconds != 120 {
		t.Errorf("EstimatedTimeSeconds = %d, want 120", resp.EstimatedTimeSeconds)
	}
}

func TestClusterEvent_Struct(t *testing.T) {
	now := time.Now()

	event := &ClusterEvent{
		ID:                 "event-123",
		NamespaceClusterID: "cluster-456",
		EventType:          EventClusterReady,
		NodeID:             "node-1",
		Message:            "Cluster is now ready",
		Metadata:           `{"nodes":["node-1","node-2","node-3"]}`,
		CreatedAt:          now,
	}

	if event.EventType != EventClusterReady {
		t.Errorf("EventType = %s, want %s", event.EventType, EventClusterReady)
	}
	if event.Message != "Cluster is now ready" {
		t.Errorf("Message = %s, want 'Cluster is now ready'", event.Message)
	}
}

func TestPortBlock_Struct(t *testing.T) {
	now := time.Now()

	block := &PortBlock{
		ID:                  "port-block-123",
		NodeID:              "node-456",
		NamespaceClusterID:  "cluster-789",
		PortStart:           10000,
		PortEnd:             10004,
		RQLiteHTTPPort:      10000,
		RQLiteRaftPort:      10001,
		OlricHTTPPort:       10002,
		OlricMemberlistPort: 10003,
		GatewayHTTPPort:     10004,
		AllocatedAt:         now,
	}

	// Verify port calculations
	if block.PortEnd-block.PortStart+1 != PortsPerNamespace {
		t.Errorf("Port range size = %d, want %d", block.PortEnd-block.PortStart+1, PortsPerNamespace)
	}

	// Verify each port is within the block
	ports := []int{
		block.RQLiteHTTPPort,
		block.RQLiteRaftPort,
		block.OlricHTTPPort,
		block.OlricMemberlistPort,
		block.GatewayHTTPPort,
	}

	for i, port := range ports {
		if port < block.PortStart || port > block.PortEnd {
			t.Errorf("Port %d (%d) is outside block range [%d, %d]",
				i, port, block.PortStart, block.PortEnd)
		}
	}
}

func TestErrorsImplementError(t *testing.T) {
	// Verify ClusterError implements error interface
	var _ error = &ClusterError{}

	err := &ClusterError{Message: "test error"}
	var errInterface error = err

	if errInterface.Error() != "test error" {
		t.Errorf("error interface Error() = %s, want 'test error'", errInterface.Error())
	}
}

func TestErrorsUnwrap(t *testing.T) {
	// Test errors.Is/errors.As compatibility
	cause := errors.New("root cause")
	err := &ClusterError{
		Message: "wrapper",
		Cause:   cause,
	}

	if !errors.Is(err, cause) {
		t.Error("errors.Is should find the wrapped cause")
	}

	// Test unwrap chain
	unwrapped := errors.Unwrap(err)
	if unwrapped != cause {
		t.Error("errors.Unwrap should return the cause")
	}
}
488
pkg/olric/instance_spawner.go
Normal file
@ -0,0 +1,488 @@
package olric

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/DeBrosOfficial/network/pkg/tlsutil"
	"go.uber.org/zap"
	"gopkg.in/yaml.v3"
)

// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
type InstanceNodeStatus string

const (
	InstanceStatusPending  InstanceNodeStatus = "pending"
	InstanceStatusStarting InstanceNodeStatus = "starting"
	InstanceStatusRunning  InstanceNodeStatus = "running"
	InstanceStatusStopped  InstanceNodeStatus = "stopped"
	InstanceStatusFailed   InstanceNodeStatus = "failed"
)

// InstanceError represents an error during instance operations (local type to avoid import cycle)
type InstanceError struct {
	Message string
	Cause   error
}

func (e *InstanceError) Error() string {
	if e.Cause != nil {
		return e.Message + ": " + e.Cause.Error()
	}
	return e.Message
}

func (e *InstanceError) Unwrap() error {
	return e.Cause
}

// InstanceSpawner manages multiple Olric instances for namespace clusters.
// Each namespace gets its own Olric cluster with dedicated ports and memberlist.
type InstanceSpawner struct {
	logger    *zap.Logger
	baseDir   string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
	instances map[string]*OlricInstance
	mu        sync.RWMutex
}

// OlricInstance represents a running Olric instance for a namespace
type OlricInstance struct {
	Namespace       string
	NodeID          string
	HTTPPort        int
	MemberlistPort  int
	BindAddr        string
	AdvertiseAddr   string
	PeerAddresses   []string // Memberlist peer addresses for cluster discovery
	ConfigPath      string
	DataDir         string
	PID             int
	Status          InstanceNodeStatus
	StartedAt       time.Time
	LastHealthCheck time.Time
	cmd             *exec.Cmd
	logger          *zap.Logger
}

// InstanceConfig holds configuration for spawning an Olric instance
type InstanceConfig struct {
	Namespace      string   // Namespace name (e.g., "alice")
	NodeID         string   // Physical node ID
	HTTPPort       int      // HTTP API port
	MemberlistPort int      // Memberlist gossip port
	BindAddr       string   // Address to bind (e.g., "0.0.0.0")
	AdvertiseAddr  string   // Address to advertise (e.g., "192.168.1.10")
	PeerAddresses  []string // Memberlist peer addresses for initial cluster join
}

// OlricConfig represents the Olric YAML configuration structure
type OlricConfig struct {
	Server     OlricServerConfig     `yaml:"server"`
	Memberlist OlricMemberlistConfig `yaml:"memberlist"`
}

// OlricServerConfig represents the server section of Olric config
type OlricServerConfig struct {
	BindAddr string `yaml:"bindAddr"`
	BindPort int    `yaml:"bindPort"`
}

// OlricMemberlistConfig represents the memberlist section of Olric config
type OlricMemberlistConfig struct {
	Environment string   `yaml:"environment"`
	BindAddr    string   `yaml:"bindAddr"`
	BindPort    int      `yaml:"bindPort"`
	Peers       []string `yaml:"peers,omitempty"`
}

// NewInstanceSpawner creates a new Olric instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
	return &InstanceSpawner{
		logger:    logger.With(zap.String("component", "olric-instance-spawner")),
		baseDir:   baseDir,
		instances: make(map[string]*OlricInstance),
	}
}

// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(namespace, nodeID string) string {
	return fmt.Sprintf("%s:%s", namespace, nodeID)
}

// SpawnInstance starts a new Olric instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*OlricInstance, error) {
	key := instanceKey(cfg.Namespace, cfg.NodeID)

	is.mu.Lock()
	if existing, ok := is.instances[key]; ok {
		is.mu.Unlock()
		// Instance already exists, return it if running
		if existing.Status == InstanceStatusRunning {
			return existing, nil
		}
		// Otherwise, remove it and start fresh
		is.mu.Lock()
		delete(is.instances, key)
	}
	is.mu.Unlock()

	// Create data and config directories
	dataDir := filepath.Join(is.baseDir, cfg.Namespace, "olric", cfg.NodeID)
	configDir := filepath.Join(is.baseDir, cfg.Namespace, "configs")
	logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")

	for _, dir := range []string{dataDir, configDir, logsDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return nil, &InstanceError{
				Message: fmt.Sprintf("failed to create directory %s", dir),
				Cause:   err,
			}
		}
	}

	// Generate config file
	configPath := filepath.Join(configDir, fmt.Sprintf("olric-%s.yaml", cfg.NodeID))
	if err := is.generateConfig(configPath, cfg); err != nil {
		return nil, err
	}

	instance := &OlricInstance{
		Namespace:      cfg.Namespace,
		NodeID:         cfg.NodeID,
		HTTPPort:       cfg.HTTPPort,
		MemberlistPort: cfg.MemberlistPort,
		BindAddr:       cfg.BindAddr,
		AdvertiseAddr:  cfg.AdvertiseAddr,
		PeerAddresses:  cfg.PeerAddresses,
		ConfigPath:     configPath,
		DataDir:        dataDir,
		Status:         InstanceStatusStarting,
		logger:         is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
	}

	instance.logger.Info("Starting Olric instance",
		zap.Int("http_port", cfg.HTTPPort),
		zap.Int("memberlist_port", cfg.MemberlistPort),
		zap.Strings("peers", cfg.PeerAddresses),
	)

	// Create command with config environment variable
	cmd := exec.CommandContext(ctx, "olric-server")
	cmd.Env = append(os.Environ(), fmt.Sprintf("OLRIC_SERVER_CONFIG=%s", configPath))
	instance.cmd = cmd

	// Setup logging
	logPath := filepath.Join(logsDir, fmt.Sprintf("olric-%s.log", cfg.NodeID))
	logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return nil, &InstanceError{
			Message: "failed to open log file",
			Cause:   err,
		}
	}

	cmd.Stdout = logFile
	cmd.Stderr = logFile

	// Start the process
	if err := cmd.Start(); err != nil {
		logFile.Close()
		return nil, &InstanceError{
			Message: "failed to start Olric process",
			Cause:   err,
		}
	}

	logFile.Close()

	instance.PID = cmd.Process.Pid
	instance.StartedAt = time.Now()

	// Store instance
	is.mu.Lock()
	is.instances[key] = instance
	is.mu.Unlock()

	// Wait for instance to be ready
	if err := is.waitForInstanceReady(ctx, instance); err != nil {
		// Kill the process on failure
		if cmd.Process != nil {
			_ = cmd.Process.Kill()
		}
		is.mu.Lock()
		delete(is.instances, key)
		is.mu.Unlock()
		return nil, &InstanceError{
			Message: "Olric instance did not become ready",
			Cause:   err,
		}
	}

	instance.Status = InstanceStatusRunning
	instance.LastHealthCheck = time.Now()

	instance.logger.Info("Olric instance started successfully",
		zap.Int("pid", instance.PID),
	)

	// Start background process monitor
	go is.monitorInstance(instance)

	return instance, nil
}

// generateConfig generates the Olric YAML configuration file
func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig) error {
	// Use "lan" environment for namespace clusters (low latency expected)
	olricCfg := OlricConfig{
		Server: OlricServerConfig{
			BindAddr: cfg.BindAddr,
			BindPort: cfg.HTTPPort,
		},
		Memberlist: OlricMemberlistConfig{
			Environment: "lan",
			BindAddr:    cfg.BindAddr,
			BindPort:    cfg.MemberlistPort,
			Peers:       cfg.PeerAddresses,
		},
	}

	data, err := yaml.Marshal(olricCfg)
	if err != nil {
		return &InstanceError{
			Message: "failed to marshal Olric config",
			Cause:   err,
		}
	}

	if err := os.WriteFile(configPath, data, 0644); err != nil {
		return &InstanceError{
			Message: "failed to write Olric config",
			Cause:   err,
		}
	}

	return nil
}

// StopInstance stops an Olric instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, ns, nodeID string) error {
	key := instanceKey(ns, nodeID)

	is.mu.Lock()
	instance, ok := is.instances[key]
	if !ok {
		is.mu.Unlock()
		return nil // Already stopped
	}
	delete(is.instances, key)
	is.mu.Unlock()

	if instance.cmd != nil && instance.cmd.Process != nil {
		instance.logger.Info("Stopping Olric instance", zap.Int("pid", instance.PID))

		// Send SIGTERM for graceful shutdown
		if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
			// If SIGTERM fails, kill it
			_ = instance.cmd.Process.Kill()
		}

		// Wait for process to exit with timeout
		done := make(chan error, 1)
		go func() {
			done <- instance.cmd.Wait()
		}()

		select {
		case <-done:
			instance.logger.Info("Olric instance stopped gracefully")
		case <-time.After(10 * time.Second):
			instance.logger.Warn("Olric instance did not stop gracefully, killing")
			_ = instance.cmd.Process.Kill()
		case <-ctx.Done():
			_ = instance.cmd.Process.Kill()
			return ctx.Err()
		}
	}

	instance.Status = InstanceStatusStopped
	return nil
}

// StopAllInstances stops all Olric instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
	is.mu.RLock()
	var keys []string
	for key, inst := range is.instances {
		if inst.Namespace == ns {
			keys = append(keys, key)
		}
	}
	is.mu.RUnlock()

	var lastErr error
	for _, key := range keys {
		parts := strings.SplitN(key, ":", 2)
		if len(parts) == 2 {
			if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
				lastErr = err
			}
		}
	}
	return lastErr
}

// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(ns, nodeID string) (*OlricInstance, bool) {
	is.mu.RLock()
	defer is.mu.RUnlock()

	instance, ok := is.instances[instanceKey(ns, nodeID)]
	return instance, ok
}

// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*OlricInstance {
	is.mu.RLock()
	defer is.mu.RUnlock()

	var instances []*OlricInstance
	for _, inst := range is.instances {
		if inst.Namespace == ns {
			instances = append(instances, inst)
		}
	}
	return instances
}

// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, ns, nodeID string) (bool, error) {
	instance, ok := is.GetInstance(ns, nodeID)
	if !ok {
		return false, &InstanceError{Message: "instance not found"}
	}

	healthy, err := instance.IsHealthy(ctx)
	if healthy {
		is.mu.Lock()
		instance.LastHealthCheck = time.Now()
		is.mu.Unlock()
	}
	return healthy, err
}

// waitForInstanceReady waits for the Olric instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *OlricInstance) error {
	client := tlsutil.NewHTTPClient(2 * time.Second)

	// Olric health check endpoint
	url := fmt.Sprintf("http://localhost:%d/ready", instance.HTTPPort)

	maxAttempts := 120 // 2 minutes
	for i := 0; i < maxAttempts; i++ {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(1 * time.Second):
		}

		resp, err := client.Get(url)
		if err != nil {
			continue
		}
		resp.Body.Close()

		if resp.StatusCode == http.StatusOK {
			instance.logger.Debug("Olric instance ready",
				zap.Int("attempts", i+1),
			)
			return nil
		}
	}

	return fmt.Errorf("Olric did not become ready within timeout")
}

// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *OlricInstance) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		is.mu.RLock()
		key := instanceKey(instance.Namespace, instance.NodeID)
		_, exists := is.instances[key]
		is.mu.RUnlock()

		if !exists {
			// Instance was removed
			return
		}

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		healthy, _ := instance.IsHealthy(ctx)
		cancel()

		is.mu.Lock()
		if healthy {
			instance.Status = InstanceStatusRunning
			instance.LastHealthCheck = time.Now()
		} else {
			instance.Status = InstanceStatusFailed
			instance.logger.Warn("Olric instance health check failed")
		}
		is.mu.Unlock()

		// Check if process is still running
		if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
			is.mu.Lock()
			instance.Status = InstanceStatusStopped
			is.mu.Unlock()
			instance.logger.Warn("Olric instance process exited unexpectedly")
			return
		}
	}
}

// IsHealthy checks if the Olric instance is healthy
func (oi *OlricInstance) IsHealthy(ctx context.Context) (bool, error) {
	url := fmt.Sprintf("http://localhost:%d/ready", oi.HTTPPort)
	client := tlsutil.NewHTTPClient(5 * time.Second)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return false, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK, nil
}

// DSN returns the connection address for this Olric instance
func (oi *OlricInstance) DSN() string {
	return fmt.Sprintf("localhost:%d", oi.HTTPPort)
}

// AdvertisedDSN returns the advertised connection address
func (oi *OlricInstance) AdvertisedDSN() string {
	return fmt.Sprintf("%s:%d", oi.AdvertiseAddr, oi.HTTPPort)
}

// MemberlistAddress returns the memberlist address for cluster communication
func (oi *OlricInstance) MemberlistAddress() string {
	return fmt.Sprintf("%s:%d", oi.AdvertiseAddr, oi.MemberlistPort)
}
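
// exampleSpawnNamespaceOlric is an illustrative sketch (nothing in the package calls it):
// it shows how a provisioning flow might spawn and later stop one Olric instance for a
// namespace. The base directory, addresses and ports below are assumptions for the example;
// real values come from the namespace port allocator and the node inventory.
func exampleSpawnNamespaceOlric(ctx context.Context, logger *zap.Logger) error {
	baseDir := filepath.Join(os.Getenv("HOME"), ".orama", "data", "namespaces")
	spawner := NewInstanceSpawner(baseDir, logger)

	inst, err := spawner.SpawnInstance(ctx, InstanceConfig{
		Namespace:      "alice",
		NodeID:         "node-1",
		HTTPPort:       10002, // Olric HTTP port from the namespace port block
		MemberlistPort: 10003, // Olric memberlist port from the namespace port block
		BindAddr:       "0.0.0.0",
		AdvertiseAddr:  "192.168.1.10",
		PeerAddresses:  []string{"192.168.1.11:10003", "192.168.1.12:10003"},
	})
	if err != nil {
		return err
	}

	logger.Info("namespace Olric instance up", zap.String("dsn", inst.AdvertisedDSN()))
	return spawner.StopInstance(ctx, "alice", "node-1")
}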
586
pkg/rqlite/instance_spawner.go
Normal file
@ -0,0 +1,586 @@
|
||||
package rqlite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/tlsutil"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
|
||||
type InstanceNodeStatus string
|
||||
|
||||
const (
|
||||
InstanceStatusPending InstanceNodeStatus = "pending"
|
||||
InstanceStatusStarting InstanceNodeStatus = "starting"
|
||||
InstanceStatusRunning InstanceNodeStatus = "running"
|
||||
InstanceStatusStopped InstanceNodeStatus = "stopped"
|
||||
InstanceStatusFailed InstanceNodeStatus = "failed"
|
||||
)
|
||||
|
||||
// InstanceError represents an error during instance operations (local type to avoid import cycle)
|
||||
type InstanceError struct {
|
||||
Message string
|
||||
Cause error
|
||||
}
|
||||
|
||||
func (e *InstanceError) Error() string {
|
||||
if e.Cause != nil {
|
||||
return e.Message + ": " + e.Cause.Error()
|
||||
}
|
||||
return e.Message
|
||||
}
|
||||
|
||||
func (e *InstanceError) Unwrap() error {
|
||||
return e.Cause
|
||||
}
|
||||
|
||||
// InstanceSpawner manages multiple RQLite instances for namespace clusters.
|
||||
// Each namespace gets its own RQLite cluster with dedicated ports and data directories.
|
||||
type InstanceSpawner struct {
|
||||
logger *zap.Logger
|
||||
baseDir string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
|
||||
instances map[string]*RQLiteInstance
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// RQLiteInstance represents a running RQLite instance for a namespace
|
||||
type RQLiteInstance struct {
|
||||
Namespace string
|
||||
NodeID string
|
||||
HTTPPort int
|
||||
RaftPort int
|
||||
HTTPAdvAddress string
|
||||
RaftAdvAddress string
|
||||
JoinAddresses []string
|
||||
DataDir string
|
||||
IsLeader bool
|
||||
PID int
|
||||
Status InstanceNodeStatus
|
||||
StartedAt time.Time
|
||||
LastHealthCheck time.Time
|
||||
cmd *exec.Cmd
|
||||
logger *zap.Logger
|
||||
}
|
||||
|
||||
// InstanceConfig holds configuration for spawning an RQLite instance
|
||||
type InstanceConfig struct {
|
||||
Namespace string // Namespace name (e.g., "alice")
|
||||
NodeID string // Physical node ID
|
||||
HTTPPort int // HTTP API port
|
||||
RaftPort int // Raft consensus port
|
||||
HTTPAdvAddress string // Advertised HTTP address (e.g., "192.168.1.10:10000")
|
||||
RaftAdvAddress string // Advertised Raft address (e.g., "192.168.1.10:10001")
|
||||
    JoinAddresses  []string // Addresses of existing cluster members to join
    IsLeader       bool     // Whether this is the initial leader node
}

// NewInstanceSpawner creates a new RQLite instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
    return &InstanceSpawner{
        logger:    logger.With(zap.String("component", "rqlite-instance-spawner")),
        baseDir:   baseDir,
        instances: make(map[string]*RQLiteInstance),
    }
}

// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(namespace, nodeID string) string {
    return fmt.Sprintf("%s:%s", namespace, nodeID)
}

// SpawnInstance starts a new RQLite instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*RQLiteInstance, error) {
    key := instanceKey(cfg.Namespace, cfg.NodeID)

    is.mu.Lock()
    if existing, ok := is.instances[key]; ok {
        is.mu.Unlock()
        // Instance already exists, return it if running
        if existing.Status == InstanceStatusRunning {
            return existing, nil
        }
        // Otherwise, remove it and start fresh
        is.mu.Lock()
        delete(is.instances, key)
    }
    is.mu.Unlock()

    // Create data directory
    dataDir := filepath.Join(is.baseDir, cfg.Namespace, "rqlite", cfg.NodeID)
    if err := os.MkdirAll(dataDir, 0755); err != nil {
        return nil, &InstanceError{
            Message: "failed to create data directory",
            Cause:   err,
        }
    }

    // Create logs directory
    logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
    if err := os.MkdirAll(logsDir, 0755); err != nil {
        return nil, &InstanceError{
            Message: "failed to create logs directory",
            Cause:   err,
        }
    }

    instance := &RQLiteInstance{
        Namespace:      cfg.Namespace,
        NodeID:         cfg.NodeID,
        HTTPPort:       cfg.HTTPPort,
        RaftPort:       cfg.RaftPort,
        HTTPAdvAddress: cfg.HTTPAdvAddress,
        RaftAdvAddress: cfg.RaftAdvAddress,
        JoinAddresses:  cfg.JoinAddresses,
        DataDir:        dataDir,
        IsLeader:       cfg.IsLeader,
        Status:         InstanceStatusStarting,
        logger:         is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
    }

    // Build command arguments
    args := []string{
        "-http-addr", fmt.Sprintf("0.0.0.0:%d", cfg.HTTPPort),
        "-http-adv-addr", cfg.HTTPAdvAddress,
        "-raft-addr", fmt.Sprintf("0.0.0.0:%d", cfg.RaftPort),
        "-raft-adv-addr", cfg.RaftAdvAddress,
    }

    // Handle cluster joining
    if len(cfg.JoinAddresses) > 0 && !cfg.IsLeader {
        // Remove peers.json if it exists to avoid stale cluster state
        peersJSONPath := filepath.Join(dataDir, "raft", "peers.json")
        if _, err := os.Stat(peersJSONPath); err == nil {
            instance.logger.Debug("Removing existing peers.json before joining cluster",
                zap.String("path", peersJSONPath))
            _ = os.Remove(peersJSONPath)
        }

        // Prepare join addresses (strip http:// prefix if present)
        joinAddrs := make([]string, 0, len(cfg.JoinAddresses))
        for _, addr := range cfg.JoinAddresses {
            addr = strings.TrimPrefix(addr, "http://")
            addr = strings.TrimPrefix(addr, "https://")
            joinAddrs = append(joinAddrs, addr)
        }

        // Wait for join targets to be available
        if err := is.waitForJoinTargets(ctx, cfg.JoinAddresses); err != nil {
            instance.logger.Warn("Join targets not all reachable, will still attempt join",
                zap.Error(err))
        }

        args = append(args,
            "-join", strings.Join(joinAddrs, ","),
            "-join-as", cfg.RaftAdvAddress,
            "-join-attempts", "30",
            "-join-interval", "10s",
        )
    }

    // Add data directory as final argument
    args = append(args, dataDir)

    instance.logger.Info("Starting RQLite instance",
        zap.Int("http_port", cfg.HTTPPort),
        zap.Int("raft_port", cfg.RaftPort),
        zap.Strings("join_addresses", cfg.JoinAddresses),
        zap.Bool("is_leader", cfg.IsLeader),
    )

    // Create command
    cmd := exec.CommandContext(ctx, "rqlited", args...)
    instance.cmd = cmd

    // Setup logging
    logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", cfg.NodeID))
    logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
    if err != nil {
        return nil, &InstanceError{
            Message: "failed to open log file",
            Cause:   err,
        }
    }

    cmd.Stdout = logFile
    cmd.Stderr = logFile

    // Start the process
    if err := cmd.Start(); err != nil {
        logFile.Close()
        return nil, &InstanceError{
            Message: "failed to start RQLite process",
            Cause:   err,
        }
    }

    logFile.Close()

    instance.PID = cmd.Process.Pid
    instance.StartedAt = time.Now()

    // Store instance
    is.mu.Lock()
    is.instances[key] = instance
    is.mu.Unlock()

    // Wait for instance to be ready
    if err := is.waitForInstanceReady(ctx, instance); err != nil {
        // Kill the process on failure
        if cmd.Process != nil {
            _ = cmd.Process.Kill()
        }
        is.mu.Lock()
        delete(is.instances, key)
        is.mu.Unlock()
        return nil, &InstanceError{
            Message: "RQLite instance did not become ready",
            Cause:   err,
        }
    }

    instance.Status = InstanceStatusRunning
    instance.LastHealthCheck = time.Now()

    instance.logger.Info("RQLite instance started successfully",
        zap.Int("pid", instance.PID),
    )

    // Start background process monitor
    go is.monitorInstance(instance)

    return instance, nil
}
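As a rough sketch of how a caller might drive this API for a two-node namespace cluster (the base directory, ports, and hostnames below are invented for illustration; only the InstanceSpawner/InstanceConfig API comes from this file):

// spawnExampleCluster is a hypothetical caller sketch, not part of this commit.
func spawnExampleCluster(ctx context.Context, logger *zap.Logger) error {
    spawner := NewInstanceSpawner("/var/lib/orama/namespaces", logger)

    // Bootstrap the initial leader first; it starts with no join targets.
    leader, err := spawner.SpawnInstance(ctx, InstanceConfig{
        Namespace:      "test-ns",
        NodeID:         "node-1",
        HTTPPort:       5001,
        RaftPort:       7001,
        HTTPAdvAddress: "node-1.internal:5001",
        RaftAdvAddress: "node-1.internal:7001",
        IsLeader:       true,
    })
    if err != nil {
        return err
    }

    // A follower joins through the leader's advertised HTTP address;
    // SpawnInstance waits for the join target and for Raft readiness.
    _, err = spawner.SpawnInstance(ctx, InstanceConfig{
        Namespace:      "test-ns",
        NodeID:         "node-2",
        HTTPPort:       5002,
        RaftPort:       7002,
        HTTPAdvAddress: "node-2.internal:5002",
        RaftAdvAddress: "node-2.internal:7002",
        JoinAddresses:  []string{leader.AdvertisedDSN()},
    })
    return err
}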
// StopInstance stops an RQLite instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, namespace, nodeID string) error {
    key := instanceKey(namespace, nodeID)

    is.mu.Lock()
    instance, ok := is.instances[key]
    if !ok {
        is.mu.Unlock()
        return nil // Already stopped
    }
    delete(is.instances, key)
    is.mu.Unlock()

    if instance.cmd != nil && instance.cmd.Process != nil {
        instance.logger.Info("Stopping RQLite instance", zap.Int("pid", instance.PID))

        // Send SIGTERM for graceful shutdown
        if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
            // If SIGTERM fails, kill it
            _ = instance.cmd.Process.Kill()
        }

        // Wait for process to exit with timeout
        done := make(chan error, 1)
        go func() {
            done <- instance.cmd.Wait()
        }()

        select {
        case <-done:
            instance.logger.Info("RQLite instance stopped gracefully")
        case <-time.After(10 * time.Second):
            instance.logger.Warn("RQLite instance did not stop gracefully, killing")
            _ = instance.cmd.Process.Kill()
        case <-ctx.Done():
            _ = instance.cmd.Process.Kill()
            return ctx.Err()
        }
    }

    instance.Status = InstanceStatusStopped
    return nil
}

// StopAllInstances stops all RQLite instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
    is.mu.RLock()
    var keys []string
    for key, inst := range is.instances {
        if inst.Namespace == ns {
            keys = append(keys, key)
        }
    }
    is.mu.RUnlock()

    var lastErr error
    for _, key := range keys {
        parts := strings.SplitN(key, ":", 2)
        if len(parts) == 2 {
            if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
                lastErr = err
            }
        }
    }
    return lastErr
}

// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(namespace, nodeID string) (*RQLiteInstance, bool) {
    is.mu.RLock()
    defer is.mu.RUnlock()

    instance, ok := is.instances[instanceKey(namespace, nodeID)]
    return instance, ok
}

// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*RQLiteInstance {
    is.mu.RLock()
    defer is.mu.RUnlock()

    var instances []*RQLiteInstance
    for _, inst := range is.instances {
        if inst.Namespace == ns {
            instances = append(instances, inst)
        }
    }
    return instances
}

// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, namespace, nodeID string) (bool, error) {
    instance, ok := is.GetInstance(namespace, nodeID)
    if !ok {
        return false, &InstanceError{Message: "instance not found"}
    }

    healthy, err := instance.IsHealthy(ctx)
    if healthy {
        is.mu.Lock()
        instance.LastHealthCheck = time.Now()
        is.mu.Unlock()
    }
    return healthy, err
}
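A caller tearing down a namespace would typically bound the graceful-shutdown wait in StopInstance with its own deadline; a minimal sketch, assuming a 30-second budget that this commit does not itself prescribe:

// teardownNamespace is a hypothetical teardown helper, not part of this commit.
func teardownNamespace(spawner *InstanceSpawner, ns string) error {
    // StopInstance sends SIGTERM and waits up to 10s per instance;
    // the overall 30s budget here is an assumed value.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()
    return spawner.StopAllInstances(ctx, ns)
}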
// waitForJoinTargets waits for join target nodes to be reachable
func (is *InstanceSpawner) waitForJoinTargets(ctx context.Context, joinAddresses []string) error {
    timeout := 2 * time.Minute
    deadline := time.Now().Add(timeout)
    client := tlsutil.NewHTTPClient(5 * time.Second)

    for time.Now().Before(deadline) {
        allReachable := true
        for _, addr := range joinAddresses {
            statusURL := addr
            if !strings.HasPrefix(addr, "http") {
                statusURL = "http://" + addr
            }
            statusURL = strings.TrimRight(statusURL, "/") + "/status"

            resp, err := client.Get(statusURL)
            if err != nil {
                allReachable = false
                break
            }
            resp.Body.Close()
            if resp.StatusCode != http.StatusOK {
                allReachable = false
                break
            }
        }

        if allReachable {
            return nil
        }

        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(2 * time.Second):
        }
    }

    return fmt.Errorf("join targets not reachable within timeout")
}

// waitForInstanceReady waits for the RQLite instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *RQLiteInstance) error {
    url := fmt.Sprintf("http://localhost:%d/status", instance.HTTPPort)
    client := tlsutil.NewHTTPClient(2 * time.Second)

    // Longer timeout for joining nodes as they need to sync
    maxAttempts := 180 // 3 minutes
    if len(instance.JoinAddresses) > 0 {
        maxAttempts = 300 // 5 minutes for joiners
    }

    for i := 0; i < maxAttempts; i++ {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(1 * time.Second):
        }

        resp, err := client.Get(url)
        if err != nil {
            continue
        }

        if resp.StatusCode == http.StatusOK {
            body, _ := io.ReadAll(resp.Body)
            resp.Body.Close()

            var statusResp map[string]interface{}
            if err := json.Unmarshal(body, &statusResp); err == nil {
                if raft, ok := statusResp["raft"].(map[string]interface{}); ok {
                    state, _ := raft["state"].(string)
                    if state == "leader" || state == "follower" {
                        instance.logger.Debug("RQLite instance ready",
                            zap.String("state", state),
                            zap.Int("attempts", i+1),
                        )
                        return nil
                    }
                } else {
                    // Backwards compatibility - if no raft status, consider ready
                    return nil
                }
            }
        }
        resp.Body.Close()
    }

    return fmt.Errorf("RQLite did not become ready within timeout")
}
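Both the readiness poll above and the health check further down only inspect a small slice of rqlited's /status response; the minimal shape they accept looks roughly like the following (values are illustrative):

// Illustrative only: the smallest /status payload these checks treat as
// "ready"/"healthy". Real rqlited responses carry many more fields, and a
// payload without a top-level "raft" object is accepted for backwards
// compatibility.
const exampleStatusPayload = `{
  "raft": {
    "state": "follower",
    "leader_addr": "10.0.0.1:7001"
  }
}`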
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *RQLiteInstance) {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()

    for range ticker.C {
        is.mu.RLock()
        key := instanceKey(instance.Namespace, instance.NodeID)
        _, exists := is.instances[key]
        is.mu.RUnlock()

        if !exists {
            // Instance was removed
            return
        }

        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        healthy, _ := instance.IsHealthy(ctx)
        cancel()

        is.mu.Lock()
        if healthy {
            instance.Status = InstanceStatusRunning
            instance.LastHealthCheck = time.Now()
        } else {
            instance.Status = InstanceStatusFailed
            instance.logger.Warn("RQLite instance health check failed")
        }
        is.mu.Unlock()

        // Check if process is still running
        if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
            is.mu.Lock()
            instance.Status = InstanceStatusStopped
            is.mu.Unlock()
            instance.logger.Warn("RQLite instance process exited unexpectedly")
            return
        }
    }
}

// IsHealthy checks if the RQLite instance is healthy
func (ri *RQLiteInstance) IsHealthy(ctx context.Context) (bool, error) {
    url := fmt.Sprintf("http://localhost:%d/status", ri.HTTPPort)
    client := tlsutil.NewHTTPClient(5 * time.Second)

    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return false, err
    }

    resp, err := client.Do(req)
    if err != nil {
        return false, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return false, fmt.Errorf("status endpoint returned %d", resp.StatusCode)
    }

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return false, err
    }

    var statusResp map[string]interface{}
    if err := json.Unmarshal(body, &statusResp); err != nil {
        return false, err
    }

    if raft, ok := statusResp["raft"].(map[string]interface{}); ok {
        state, _ := raft["state"].(string)
        return state == "leader" || state == "follower", nil
    }

    // Backwards compatibility
    return true, nil
}

// GetLeaderAddress returns the leader's address for the cluster
func (ri *RQLiteInstance) GetLeaderAddress(ctx context.Context) (string, error) {
    url := fmt.Sprintf("http://localhost:%d/status", ri.HTTPPort)
    client := tlsutil.NewHTTPClient(5 * time.Second)

    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return "", err
    }

    resp, err := client.Do(req)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }

    var statusResp map[string]interface{}
    if err := json.Unmarshal(body, &statusResp); err != nil {
        return "", err
    }

    if raft, ok := statusResp["raft"].(map[string]interface{}); ok {
        if leader, ok := raft["leader_addr"].(string); ok {
            return leader, nil
        }
    }

    return "", fmt.Errorf("leader address not found in status response")
}

// DSN returns the connection string for this RQLite instance
func (ri *RQLiteInstance) DSN() string {
    return fmt.Sprintf("http://localhost:%d", ri.HTTPPort)
}

// AdvertisedDSN returns the advertised connection string for cluster communication
func (ri *RQLiteInstance) AdvertisedDSN() string {
    return fmt.Sprintf("http://%s", ri.HTTPAdvAddress)
}
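Downstream code that holds an *RQLiteInstance can derive both a local and an advertised endpoint from it; a hypothetical probe built only on the methods above:

// describeInstance is a hypothetical consumer sketch, not part of this commit:
// it probes the local HTTP API and logs where the cluster leader currently lives.
func describeInstance(ctx context.Context, ri *RQLiteInstance, logger *zap.Logger) {
    healthy, err := ri.IsHealthy(ctx)
    if err != nil {
        logger.Warn("health probe failed", zap.String("dsn", ri.DSN()), zap.Error(err))
        return
    }
    leader, _ := ri.GetLeaderAddress(ctx)
    logger.Info("rqlite instance state",
        zap.String("dsn", ri.DSN()),
        zap.String("advertised", ri.AdvertisedDSN()),
        zap.Bool("healthy", healthy),
        zap.String("leader_addr", leader),
    )
}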