Namespace support for the gateway and load balancer, plus per-namespace RQLite and Olric clusters

anonpenguin23 2026-01-28 11:24:21 +02:00
parent 468ca06398
commit edd9c1f3dc
23 changed files with 6335 additions and 23 deletions

.gitignore

@ -93,3 +93,5 @@ orama-cli-linux
rnd/
keys_backup/
vps.txt


@ -0,0 +1,391 @@
//go:build e2e

package e2e

import (
"encoding/json"
"fmt"
"io"
"net/http"
"path/filepath"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestNamespaceCluster_Provisioning tests that creating a new namespace
// triggers cluster provisioning with 202 Accepted response
func TestNamespaceCluster_Provisioning(t *testing.T) {
if !IsProductionMode() {
t.Skip("Namespace cluster provisioning only applies in production mode")
}
// This test requires a completely new namespace to trigger provisioning
newNamespace := fmt.Sprintf("test-ns-%d", time.Now().UnixNano())
env, err := LoadTestEnvWithNamespace(newNamespace)
require.NoError(t, err, "Should create test environment")
t.Run("New namespace triggers provisioning", func(t *testing.T) {
// If we got here with an API key, provisioning either completed or was not required
// The LoadTestEnvWithNamespace function handles the provisioning flow
require.NotEmpty(t, env.APIKey, "Should have received API key after provisioning")
t.Logf("Namespace %s provisioned successfully", newNamespace)
})
t.Run("Namespace gateway is accessible", func(t *testing.T) {
// Try to access the namespace gateway
// The URL should be ns-{namespace}.{baseDomain}
cfg, _ := LoadE2EConfig()
if cfg.BaseDomain == "" {
cfg.BaseDomain = "devnet-orama.network"
}
nsGatewayURL := fmt.Sprintf("https://ns-%s.%s", newNamespace, cfg.BaseDomain)
req, _ := http.NewRequest("GET", nsGatewayURL+"/v1/health", nil)
req.Header.Set("Authorization", "Bearer "+env.APIKey)
resp, err := env.HTTPClient.Do(req)
if err != nil {
t.Logf("Note: Namespace gateway not accessible (expected in local mode): %v", err)
t.Skip("Namespace gateway endpoint not available")
}
defer resp.Body.Close()
assert.Equal(t, http.StatusOK, resp.StatusCode, "Namespace gateway should be healthy")
t.Logf("Namespace gateway %s is accessible", nsGatewayURL)
})
}
// TestNamespaceCluster_StatusPolling tests the /v1/namespace/status endpoint
func TestNamespaceCluster_StatusPolling(t *testing.T) {
env, err := LoadTestEnv()
require.NoError(t, err, "Should load test environment")
t.Run("Status endpoint returns valid response", func(t *testing.T) {
// Test with a non-existent cluster ID (should return 404)
req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/namespace/status?id=non-existent-id", nil)
resp, err := env.HTTPClient.Do(req)
require.NoError(t, err, "Should execute request")
defer resp.Body.Close()
// Should return 404 for non-existent cluster
assert.Equal(t, http.StatusNotFound, resp.StatusCode, "Should return 404 for non-existent cluster")
})
}
// TestNamespaceCluster_CrossGatewayAccess tests that API keys from one namespace
// cannot access another namespace's dedicated gateway
func TestNamespaceCluster_CrossGatewayAccess(t *testing.T) {
if !IsProductionMode() {
t.Skip("Cross-gateway access control only applies in production mode")
}
// Create two namespaces
nsA := fmt.Sprintf("ns-a-%d", time.Now().Unix())
nsB := fmt.Sprintf("ns-b-%d", time.Now().Unix())
envA, err := LoadTestEnvWithNamespace(nsA)
require.NoError(t, err, "Should create test environment for namespace A")
envB, err := LoadTestEnvWithNamespace(nsB)
require.NoError(t, err, "Should create test environment for namespace B")
cfg, _ := LoadE2EConfig()
if cfg.BaseDomain == "" {
cfg.BaseDomain = "devnet-orama.network"
}
t.Run("Namespace A key cannot access Namespace B gateway", func(t *testing.T) {
// Try to use namespace A's key on namespace B's gateway
nsBGatewayURL := fmt.Sprintf("https://ns-%s.%s", nsB, cfg.BaseDomain)
req, _ := http.NewRequest("GET", nsBGatewayURL+"/v1/deployments/list", nil)
req.Header.Set("Authorization", "Bearer "+envA.APIKey) // Using A's key
resp, err := envA.HTTPClient.Do(req)
if err != nil {
t.Logf("Note: Gateway not accessible: %v", err)
t.Skip("Namespace gateway endpoint not available")
}
defer resp.Body.Close()
assert.Equal(t, http.StatusForbidden, resp.StatusCode,
"Should deny namespace A's key on namespace B's gateway")
t.Logf("Cross-namespace access correctly denied (status: %d)", resp.StatusCode)
})
t.Run("Namespace B key works on Namespace B gateway", func(t *testing.T) {
nsBGatewayURL := fmt.Sprintf("https://ns-%s.%s", nsB, cfg.BaseDomain)
req, _ := http.NewRequest("GET", nsBGatewayURL+"/v1/deployments/list", nil)
req.Header.Set("Authorization", "Bearer "+envB.APIKey) // Using B's key
resp, err := envB.HTTPClient.Do(req)
if err != nil {
t.Logf("Note: Gateway not accessible: %v", err)
t.Skip("Namespace gateway endpoint not available")
}
defer resp.Body.Close()
assert.Equal(t, http.StatusOK, resp.StatusCode,
"Should allow namespace B's key on namespace B's gateway")
t.Logf("Same-namespace access correctly allowed")
})
}
// TestNamespaceCluster_DefaultNamespaceAccessible tests that the default namespace
// is accessible by any valid API key
func TestNamespaceCluster_DefaultNamespaceAccessible(t *testing.T) {
// Create a non-default namespace
customNS := fmt.Sprintf("custom-%d", time.Now().Unix())
env, err := LoadTestEnvWithNamespace(customNS)
require.NoError(t, err, "Should create test environment")
t.Run("Custom namespace key can access default gateway endpoints", func(t *testing.T) {
// The default gateway should accept keys from any namespace
req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/health", nil)
req.Header.Set("Authorization", "Bearer "+env.APIKey)
resp, err := env.HTTPClient.Do(req)
require.NoError(t, err, "Should execute request")
defer resp.Body.Close()
assert.Equal(t, http.StatusOK, resp.StatusCode,
"Default gateway should accept any valid API key")
})
}
// TestDeployment_RandomSubdomain tests that deployments get random subdomain suffix
func TestDeployment_RandomSubdomain(t *testing.T) {
env, err := LoadTestEnv()
require.NoError(t, err, "Should load test environment")
tarballPath := filepath.Join("../testdata/tarballs/react-vite.tar.gz")
// Create a deployment
deploymentName := "subdomain-test"
deploymentID := CreateTestDeployment(t, env, deploymentName, tarballPath)
defer func() {
if !env.SkipCleanup {
DeleteDeployment(t, env, deploymentID)
}
}()
t.Run("Deployment URL contains random suffix", func(t *testing.T) {
// Get deployment details
req, _ := http.NewRequest("GET", env.GatewayURL+"/v1/deployments/get?id="+deploymentID, nil)
req.Header.Set("Authorization", "Bearer "+env.APIKey)
resp, err := env.HTTPClient.Do(req)
require.NoError(t, err, "Should execute request")
defer resp.Body.Close()
require.Equal(t, http.StatusOK, resp.StatusCode, "Should get deployment")
var result map[string]interface{}
bodyBytes, _ := io.ReadAll(resp.Body)
require.NoError(t, json.Unmarshal(bodyBytes, &result), "Should decode JSON")
deployment, ok := result["deployment"].(map[string]interface{})
if !ok {
deployment = result
}
// Check subdomain field
subdomain, _ := deployment["subdomain"].(string)
if subdomain != "" {
// Subdomain should follow format: {name}-{random}
// e.g., "subdomain-test-f3o4if"
assert.True(t, strings.HasPrefix(subdomain, deploymentName+"-"),
"Subdomain should start with deployment name followed by dash")
suffix := strings.TrimPrefix(subdomain, deploymentName+"-")
assert.Equal(t, 6, len(suffix), "Random suffix should be 6 characters")
t.Logf("Deployment subdomain: %s (suffix: %s)", subdomain, suffix)
} else {
t.Logf("Note: Subdomain field not set (may be using legacy format)")
}
// Check URLs
urls, ok := deployment["urls"].([]interface{})
if ok && len(urls) > 0 {
url := urls[0].(string)
t.Logf("Deployment URL: %s", url)
// URL should contain the subdomain with random suffix
if subdomain != "" {
assert.Contains(t, url, subdomain, "URL should contain the subdomain")
}
}
})
}
// TestDeployment_SubdomainUniqueness tests that two deployments with the same name
// get different subdomains
func TestDeployment_SubdomainUniqueness(t *testing.T) {
envA, err := LoadTestEnvWithNamespace("ns-unique-a-" + fmt.Sprintf("%d", time.Now().Unix()))
require.NoError(t, err, "Should create test environment A")
envB, err := LoadTestEnvWithNamespace("ns-unique-b-" + fmt.Sprintf("%d", time.Now().Unix()))
require.NoError(t, err, "Should create test environment B")
tarballPath := filepath.Join("../testdata/tarballs/react-vite.tar.gz")
deploymentName := "same-name-app"
// Create deployment in namespace A
deploymentIDA := CreateTestDeployment(t, envA, deploymentName, tarballPath)
defer func() {
if !envA.SkipCleanup {
DeleteDeployment(t, envA, deploymentIDA)
}
}()
// Create deployment with same name in namespace B
deploymentIDB := CreateTestDeployment(t, envB, deploymentName, tarballPath)
defer func() {
if !envB.SkipCleanup {
DeleteDeployment(t, envB, deploymentIDB)
}
}()
t.Run("Same name deployments have different subdomains", func(t *testing.T) {
// Get deployment A details
reqA, _ := http.NewRequest("GET", envA.GatewayURL+"/v1/deployments/get?id="+deploymentIDA, nil)
reqA.Header.Set("Authorization", "Bearer "+envA.APIKey)
respA, _ := envA.HTTPClient.Do(reqA)
defer respA.Body.Close()
var resultA map[string]interface{}
bodyBytesA, _ := io.ReadAll(respA.Body)
json.Unmarshal(bodyBytesA, &resultA)
deploymentA, ok := resultA["deployment"].(map[string]interface{})
if !ok {
deploymentA = resultA
}
subdomainA, _ := deploymentA["subdomain"].(string)
// Get deployment B details
reqB, _ := http.NewRequest("GET", envB.GatewayURL+"/v1/deployments/get?id="+deploymentIDB, nil)
reqB.Header.Set("Authorization", "Bearer "+envB.APIKey)
respB, _ := envB.HTTPClient.Do(reqB)
defer respB.Body.Close()
var resultB map[string]interface{}
bodyBytesB, _ := io.ReadAll(respB.Body)
json.Unmarshal(bodyBytesB, &resultB)
deploymentB, ok := resultB["deployment"].(map[string]interface{})
if !ok {
deploymentB = resultB
}
subdomainB, _ := deploymentB["subdomain"].(string)
// If subdomains are set, they should be different
if subdomainA != "" && subdomainB != "" {
assert.NotEqual(t, subdomainA, subdomainB,
"Same-name deployments in different namespaces should have different subdomains")
t.Logf("Namespace A subdomain: %s", subdomainA)
t.Logf("Namespace B subdomain: %s", subdomainB)
} else {
t.Logf("Note: Subdomains not set (may be using legacy format)")
}
})
}
// TestNamespaceCluster_DNSFormat tests the DNS naming convention for namespaces
func TestNamespaceCluster_DNSFormat(t *testing.T) {
cfg, err := LoadE2EConfig()
if err != nil {
cfg = DefaultConfig()
}
if cfg.BaseDomain == "" {
cfg.BaseDomain = "devnet-orama.network"
}
t.Run("Namespace gateway DNS follows ns-{name}.{baseDomain} format", func(t *testing.T) {
namespace := "my-test-namespace"
expectedDomain := fmt.Sprintf("ns-%s.%s", namespace, cfg.BaseDomain)
t.Logf("Expected namespace gateway domain: %s", expectedDomain)
// Verify format
assert.True(t, strings.HasPrefix(expectedDomain, "ns-"),
"Namespace gateway domain should start with 'ns-'")
assert.True(t, strings.HasSuffix(expectedDomain, cfg.BaseDomain),
"Namespace gateway domain should end with base domain")
})
t.Run("Deployment DNS follows {name}-{random}.{baseDomain} format", func(t *testing.T) {
deploymentName := "my-app"
randomSuffix := "f3o4if"
expectedDomain := fmt.Sprintf("%s-%s.%s", deploymentName, randomSuffix, cfg.BaseDomain)
t.Logf("Expected deployment domain: %s", expectedDomain)
// Verify format
assert.Contains(t, expectedDomain, deploymentName,
"Deployment domain should contain the deployment name")
assert.True(t, strings.HasSuffix(expectedDomain, cfg.BaseDomain),
"Deployment domain should end with base domain")
})
}
// TestNamespaceCluster_PortAllocation tests the port allocation constraints
func TestNamespaceCluster_PortAllocation(t *testing.T) {
t.Run("Port range constants are correct", func(t *testing.T) {
// These constants are defined in pkg/namespace/types.go
const (
portRangeStart = 10000
portRangeEnd = 10099
portsPerNamespace = 5
maxNamespacesPerNode = 20
)
// Verify range calculation
totalPorts := portRangeEnd - portRangeStart + 1
assert.Equal(t, 100, totalPorts, "Port range should be 100 ports")
expectedMax := totalPorts / portsPerNamespace
assert.Equal(t, maxNamespacesPerNode, expectedMax,
"Max namespaces per node should be total ports / ports per namespace")
t.Logf("Port range: %d-%d (%d ports total)", portRangeStart, portRangeEnd, totalPorts)
t.Logf("Ports per namespace: %d", portsPerNamespace)
t.Logf("Max namespaces per node: %d", maxNamespacesPerNode)
})
t.Run("Port assignments within a block are sequential", func(t *testing.T) {
portStart := 10000
rqliteHTTP := portStart + 0
rqliteRaft := portStart + 1
olricHTTP := portStart + 2
olricMemberlist := portStart + 3
gatewayHTTP := portStart + 4
// All ports should be unique
ports := []int{rqliteHTTP, rqliteRaft, olricHTTP, olricMemberlist, gatewayHTTP}
seen := make(map[int]bool)
for _, port := range ports {
assert.False(t, seen[port], "Ports should be unique within a block")
seen[port] = true
}
t.Logf("Port assignments for block starting at %d:", portStart)
t.Logf(" RQLite HTTP: %d", rqliteHTTP)
t.Logf(" RQLite Raft: %d", rqliteRaft)
t.Logf(" Olric HTTP: %d", olricHTTP)
t.Logf(" Olric Memberlist: %d", olricMemberlist)
t.Logf(" Gateway HTTP: %d", gatewayHTTP)
})
}
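The block layout exercised by this test can be written as a small helper. A hypothetical sketch (not part of the commit; the real constants are defined in pkg/namespace/types.go):

// portBlockFor returns the five sequential service ports for the given
// namespace slot on a node: RQLite HTTP, RQLite Raft, Olric HTTP,
// Olric memberlist, and Gateway HTTP (port_start + 0 through + 4).
func portBlockFor(slot int) [5]int {
	const (
		portRangeStart    = 10000
		portsPerNamespace = 5
	)
	start := portRangeStart + slot*portsPerNamespace
	return [5]int{start, start + 1, start + 2, start + 3, start + 4}
}

For example, slot 1 yields ports 10005-10009, and slot 19 yields 10095-10099, the last block inside the reserved 10000-10099 range.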


@ -0,0 +1,190 @@
-- Migration 010: Namespace Clusters for Physical Isolation
-- Creates tables to manage per-namespace RQLite and Olric clusters
-- Each namespace gets its own 3-node cluster for complete isolation
BEGIN;
-- Extend namespaces table with cluster status tracking
-- Note: SQLite doesn't support ADD COLUMN IF NOT EXISTS, so we handle this carefully
-- These columns track the provisioning state of the namespace's dedicated cluster
-- First check if columns exist, if not add them
-- cluster_status: 'none', 'provisioning', 'ready', 'degraded', 'failed', 'deprovisioning'
-- Create a new namespaces table with additional columns if needed
CREATE TABLE IF NOT EXISTS namespaces_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
cluster_status TEXT DEFAULT 'none',
cluster_created_at TIMESTAMP,
cluster_ready_at TIMESTAMP
);
-- Copy data from old table if it exists and new columns don't
INSERT OR IGNORE INTO namespaces_new (id, name, created_at, cluster_status)
SELECT id, name, created_at, 'none' FROM namespaces WHERE NOT EXISTS (
SELECT 1 FROM pragma_table_info('namespaces') WHERE name = 'cluster_status'
);
-- If the column already exists, this migration was partially applied - skip the table swap
-- We'll use a different approach: just ensure the new tables exist
-- Namespace clusters registry
-- One record per namespace that has a dedicated cluster
CREATE TABLE IF NOT EXISTS namespace_clusters (
id TEXT PRIMARY KEY, -- UUID
namespace_id INTEGER NOT NULL UNIQUE, -- FK to namespaces
namespace_name TEXT NOT NULL, -- Cached for easier lookups
status TEXT NOT NULL DEFAULT 'provisioning', -- provisioning, ready, degraded, failed, deprovisioning
-- Cluster configuration
rqlite_node_count INTEGER NOT NULL DEFAULT 3,
olric_node_count INTEGER NOT NULL DEFAULT 3,
gateway_node_count INTEGER NOT NULL DEFAULT 3,
-- Provisioning metadata
provisioned_by TEXT NOT NULL, -- Wallet address that triggered provisioning
provisioned_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
ready_at TIMESTAMP,
last_health_check TIMESTAMP,
-- Error tracking
error_message TEXT,
retry_count INTEGER DEFAULT 0,
FOREIGN KEY (namespace_id) REFERENCES namespaces(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_namespace_clusters_status ON namespace_clusters(status);
CREATE INDEX IF NOT EXISTS idx_namespace_clusters_namespace ON namespace_clusters(namespace_id);
CREATE INDEX IF NOT EXISTS idx_namespace_clusters_name ON namespace_clusters(namespace_name);
-- Namespace cluster nodes
-- Tracks which physical nodes host services for each namespace cluster
CREATE TABLE IF NOT EXISTS namespace_cluster_nodes (
id TEXT PRIMARY KEY, -- UUID
namespace_cluster_id TEXT NOT NULL, -- FK to namespace_clusters
node_id TEXT NOT NULL, -- FK to dns_nodes (physical node)
-- Role in the cluster
-- Each node can have multiple roles (rqlite + olric + gateway)
role TEXT NOT NULL, -- 'rqlite_leader', 'rqlite_follower', 'olric', 'gateway'
-- Service ports (allocated from reserved range 10000-10099)
rqlite_http_port INTEGER, -- Port for RQLite HTTP API
rqlite_raft_port INTEGER, -- Port for RQLite Raft consensus
olric_http_port INTEGER, -- Port for Olric HTTP API
olric_memberlist_port INTEGER, -- Port for Olric memberlist gossip
gateway_http_port INTEGER, -- Port for Gateway HTTP
-- Service status
status TEXT NOT NULL DEFAULT 'pending', -- pending, starting, running, stopped, failed
process_pid INTEGER, -- PID of running process (for local management)
last_heartbeat TIMESTAMP,
error_message TEXT,
-- Join addresses for cluster formation
rqlite_join_address TEXT, -- Address to join RQLite cluster
olric_peers TEXT, -- JSON array of Olric peer addresses
-- Metadata
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(namespace_cluster_id, node_id, role),
FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_cluster ON namespace_cluster_nodes(namespace_cluster_id);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_node ON namespace_cluster_nodes(node_id);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON namespace_cluster_nodes(status);
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_role ON namespace_cluster_nodes(role);
-- Namespace port allocations
-- Manages the reserved port range (10000-10099) for namespace services
-- Each namespace instance on a node gets a block of 5 consecutive ports
CREATE TABLE IF NOT EXISTS namespace_port_allocations (
id TEXT PRIMARY KEY, -- UUID
node_id TEXT NOT NULL, -- Physical node ID
namespace_cluster_id TEXT NOT NULL, -- Namespace cluster this allocation belongs to
-- Port block (5 consecutive ports)
port_start INTEGER NOT NULL, -- Start of port block (e.g., 10000)
port_end INTEGER NOT NULL, -- End of port block (e.g., 10004)
-- Individual port assignments within the block
rqlite_http_port INTEGER NOT NULL, -- port_start + 0
rqlite_raft_port INTEGER NOT NULL, -- port_start + 1
olric_http_port INTEGER NOT NULL, -- port_start + 2
olric_memberlist_port INTEGER NOT NULL, -- port_start + 3
gateway_http_port INTEGER NOT NULL, -- port_start + 4
allocated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- Prevent overlapping allocations on same node
UNIQUE(node_id, port_start),
-- One allocation per namespace per node
UNIQUE(namespace_cluster_id, node_id),
FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_ns_port_alloc_node ON namespace_port_allocations(node_id);
CREATE INDEX IF NOT EXISTS idx_ns_port_alloc_cluster ON namespace_port_allocations(namespace_cluster_id);
-- Namespace cluster events
-- Audit log for cluster provisioning and lifecycle events
CREATE TABLE IF NOT EXISTS namespace_cluster_events (
id TEXT PRIMARY KEY, -- UUID
namespace_cluster_id TEXT NOT NULL,
event_type TEXT NOT NULL, -- Event types listed below
node_id TEXT, -- Optional: specific node this event relates to
message TEXT,
metadata TEXT, -- JSON for additional event data
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (namespace_cluster_id) REFERENCES namespace_clusters(id) ON DELETE CASCADE
);
-- Event types:
-- 'provisioning_started' - Cluster provisioning began
-- 'nodes_selected' - 3 nodes were selected for the cluster
-- 'ports_allocated' - Ports allocated on a node
-- 'rqlite_started' - RQLite instance started on a node
-- 'rqlite_joined' - RQLite instance joined the cluster
-- 'rqlite_leader_elected' - RQLite leader election completed
-- 'olric_started' - Olric instance started on a node
-- 'olric_joined' - Olric instance joined memberlist
-- 'gateway_started' - Gateway instance started on a node
-- 'dns_created' - DNS records created for namespace
-- 'cluster_ready' - All services ready, cluster is operational
-- 'cluster_degraded' - One or more nodes are unhealthy
-- 'cluster_failed' - Cluster failed to provision or operate
-- 'node_failed' - Specific node became unhealthy
-- 'node_recovered' - Node recovered from failure
-- 'deprovisioning_started' - Cluster deprovisioning began
-- 'deprovisioned' - Cluster fully deprovisioned
CREATE INDEX IF NOT EXISTS idx_cluster_events_cluster ON namespace_cluster_events(namespace_cluster_id, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON namespace_cluster_events(event_type);
-- Global deployment registry
-- Prevents duplicate deployment subdomains across all namespaces
-- Since deployments now use {name}-{random}.{domain}, we track used subdomains globally
CREATE TABLE IF NOT EXISTS global_deployment_subdomains (
subdomain TEXT PRIMARY KEY, -- Full subdomain (e.g., 'myapp-f3o4if')
namespace TEXT NOT NULL, -- Owner namespace
deployment_id TEXT NOT NULL, -- FK to deployments (in namespace cluster)
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- No FK to deployments since deployments are in namespace-specific clusters
UNIQUE(subdomain)
);
CREATE INDEX IF NOT EXISTS idx_global_subdomains_namespace ON global_deployment_subdomains(namespace);
CREATE INDEX IF NOT EXISTS idx_global_subdomains_deployment ON global_deployment_subdomains(deployment_id);
-- Mark migration as applied
INSERT OR IGNORE INTO schema_migrations(version) VALUES (10);
COMMIT;
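As a usage sketch against these tables (not part of the migration), a provisioner could list a node's existing port blocks before choosing a free one. The client interface shape below is assumed from the db.Query(ctx, &dest, query, args...) pattern used by the deployment service later in this commit, and the snippet assumes a context import:

// queryer mirrors the rqlite client call shape used elsewhere in this commit;
// the concrete client type is assumed.
type queryer interface {
	Query(ctx context.Context, dest interface{}, query string, args ...interface{}) error
}

type portBlock struct {
	PortStart int `db:"port_start"`
	PortEnd   int `db:"port_end"`
}

// allocatedPortBlocks lists the blocks already reserved on a node in ascending
// order, so the caller can pick the first free 5-port block in 10000-10099.
func allocatedPortBlocks(ctx context.Context, db queryer, nodeID string) ([]portBlock, error) {
	var rows []portBlock
	const q = `SELECT port_start, port_end FROM namespace_port_allocations WHERE node_id = ? ORDER BY port_start`
	if err := db.Query(ctx, &rows, q, nodeID); err != nil {
		return nil, err
	}
	return rows, nil
}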


@ -85,6 +85,7 @@ func PerformSimpleAuthentication(gatewayURL string) (*Credentials, error) {
}
// requestAPIKeyFromGateway calls the gateway's simple-key endpoint to generate an API key
// For non-default namespaces, this may trigger cluster provisioning and require polling
func requestAPIKeyFromGateway(gatewayURL, wallet, namespace string) (string, error) {
reqBody := map[string]string{
"wallet": wallet,
@ -109,6 +110,170 @@ func requestAPIKeyFromGateway(gatewayURL, wallet, namespace string) (string, err
}
defer resp.Body.Close()
// Handle 202 Accepted - namespace cluster is being provisioned
if resp.StatusCode == http.StatusAccepted {
return handleProvisioningResponse(gatewayURL, client, resp, wallet, namespace)
}
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return "", fmt.Errorf("gateway returned status %d: %s", resp.StatusCode, string(body))
}
var respBody map[string]interface{}
if err := json.NewDecoder(resp.Body).Decode(&respBody); err != nil {
return "", fmt.Errorf("failed to decode response: %w", err)
}
apiKey, ok := respBody["api_key"].(string)
if !ok || apiKey == "" {
return "", fmt.Errorf("no api_key in response")
}
return apiKey, nil
}
// handleProvisioningResponse handles 202 Accepted responses when namespace cluster provisioning is needed
func handleProvisioningResponse(gatewayURL string, client *http.Client, resp *http.Response, wallet, namespace string) (string, error) {
var provResp map[string]interface{}
if err := json.NewDecoder(resp.Body).Decode(&provResp); err != nil {
return "", fmt.Errorf("failed to decode provisioning response: %w", err)
}
status, _ := provResp["status"].(string)
pollURL, _ := provResp["poll_url"].(string)
clusterID, _ := provResp["cluster_id"].(string)
message, _ := provResp["message"].(string)
if status != "provisioning" {
return "", fmt.Errorf("unexpected status: %s", status)
}
fmt.Printf("\n🏗 Provisioning namespace cluster...\n")
if message != "" {
fmt.Printf(" %s\n", message)
}
if clusterID != "" {
fmt.Printf(" Cluster ID: %s\n", clusterID)
}
fmt.Println()
// Poll until cluster is ready
if err := pollProvisioningStatus(gatewayURL, client, pollURL); err != nil {
return "", err
}
// Cluster is ready, retry the API key request
fmt.Println("\n✅ Namespace cluster ready!")
fmt.Println("⏳ Retrieving API key...")
return retryAPIKeyRequest(gatewayURL, client, wallet, namespace)
}
// pollProvisioningStatus polls the status endpoint until the cluster is ready
func pollProvisioningStatus(gatewayURL string, client *http.Client, pollURL string) error {
// Build full poll URL if it's a relative path
if strings.HasPrefix(pollURL, "/") {
pollURL = gatewayURL + pollURL
}
maxAttempts := 120 // 10 minutes (5 seconds per poll)
pollInterval := 5 * time.Second
spinnerChars := []string{"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"}
spinnerIdx := 0
for i := 0; i < maxAttempts; i++ {
// Show progress spinner
fmt.Printf("\r%s Waiting for cluster... ", spinnerChars[spinnerIdx%len(spinnerChars)])
spinnerIdx++
resp, err := client.Get(pollURL)
if err != nil {
time.Sleep(pollInterval)
continue
}
var statusResp map[string]interface{}
if err := json.NewDecoder(resp.Body).Decode(&statusResp); err != nil {
resp.Body.Close()
time.Sleep(pollInterval)
continue
}
resp.Body.Close()
status, _ := statusResp["status"].(string)
switch status {
case "ready":
fmt.Printf("\r✅ Cluster ready! \n")
return nil
case "failed":
errMsg, _ := statusResp["error"].(string)
fmt.Printf("\r❌ Provisioning failed \n")
return fmt.Errorf("cluster provisioning failed: %s", errMsg)
case "provisioning":
// Show progress details
rqliteReady, _ := statusResp["rqlite_ready"].(bool)
olricReady, _ := statusResp["olric_ready"].(bool)
gatewayReady, _ := statusResp["gateway_ready"].(bool)
dnsReady, _ := statusResp["dns_ready"].(bool)
progressStr := ""
if rqliteReady {
progressStr += "RQLite✓ "
}
if olricReady {
progressStr += "Olric✓ "
}
if gatewayReady {
progressStr += "Gateway✓ "
}
if dnsReady {
progressStr += "DNS✓"
}
if progressStr != "" {
fmt.Printf("\r%s Provisioning... [%s]", spinnerChars[spinnerIdx%len(spinnerChars)], progressStr)
}
default:
// Unknown status, continue polling
}
time.Sleep(pollInterval)
}
fmt.Printf("\r⚠ Timeout waiting for cluster \n")
return fmt.Errorf("timeout waiting for namespace cluster provisioning")
}
// retryAPIKeyRequest retries the API key request after cluster provisioning
func retryAPIKeyRequest(gatewayURL string, client *http.Client, wallet, namespace string) (string, error) {
reqBody := map[string]string{
"wallet": wallet,
"namespace": namespace,
}
payload, err := json.Marshal(reqBody)
if err != nil {
return "", fmt.Errorf("failed to marshal request: %w", err)
}
endpoint := gatewayURL + "/v1/auth/simple-key"
resp, err := client.Post(endpoint, "application/json", bytes.NewReader(payload))
if err != nil {
return "", fmt.Errorf("failed to call gateway: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusAccepted {
// Still provisioning? This shouldn't happen but handle gracefully
return "", fmt.Errorf("cluster still provisioning, please try again")
}
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return "", fmt.Errorf("gateway returned status %d: %s", resp.StatusCode, string(body))


@ -1,6 +1,7 @@
package install
import (
"bufio"
"fmt"
"os"
"path/filepath"
@ -25,6 +26,11 @@ func NewOrchestrator(flags *Flags) (*Orchestrator, error) {
oramaHome := "/home/debros"
oramaDir := oramaHome + "/.orama"
// Prompt for base domain if not provided via flag
if flags.BaseDomain == "" {
flags.BaseDomain = promptForBaseDomain()
}
// Normalize peers
peers, err := utils.NormalizePeers(flags.PeersStr)
if err != nil {
@ -227,3 +233,52 @@ func (o *Orchestrator) printFirstNodeSecrets() {
fmt.Printf(" Node Peer ID:\n")
fmt.Printf(" %s\n\n", o.setup.NodePeerID)
}
// promptForBaseDomain interactively prompts the user to select a network environment
// Returns the selected base domain for deployment routing
func promptForBaseDomain() string {
reader := bufio.NewReader(os.Stdin)
fmt.Println("\n🌐 Network Environment Selection")
fmt.Println("=================================")
fmt.Println("Select the network environment for this node:")
fmt.Println()
fmt.Println(" 1. devnet-orama.network (Development - for testing)")
fmt.Println(" 2. testnet-orama.network (Testnet - pre-production)")
fmt.Println(" 3. mainnet-orama.network (Mainnet - production)")
fmt.Println(" 4. Custom domain...")
fmt.Println()
fmt.Print("Select option [1-4] (default: 1): ")
choice, _ := reader.ReadString('\n')
choice = strings.TrimSpace(choice)
switch choice {
case "", "1":
fmt.Println("✓ Selected: devnet-orama.network")
return "devnet-orama.network"
case "2":
fmt.Println("✓ Selected: testnet-orama.network")
return "testnet-orama.network"
case "3":
fmt.Println("✓ Selected: mainnet-orama.network")
return "mainnet-orama.network"
case "4":
fmt.Print("Enter custom base domain (e.g., example.com): ")
customDomain, _ := reader.ReadString('\n')
customDomain = strings.TrimSpace(customDomain)
if customDomain == "" {
fmt.Println("⚠️ No domain entered, using devnet-orama.network")
return "devnet-orama.network"
}
// Remove any protocol prefix if user included it
customDomain = strings.TrimPrefix(customDomain, "https://")
customDomain = strings.TrimPrefix(customDomain, "http://")
customDomain = strings.TrimSuffix(customDomain, "/")
fmt.Printf("✓ Selected: %s\n", customDomain)
return customDomain
default:
fmt.Println("⚠️ Invalid option, using devnet-orama.network")
return "devnet-orama.network"
}
}


@ -9,10 +9,12 @@ import (
// IssueAPIKeyHandler issues an API key after signature verification.
// Similar to VerifyHandler but only returns the API key without JWT tokens.
// For non-default namespaces, may trigger cluster provisioning and return 202 Accepted.
//
// POST /v1/auth/api-key
// Request body: APIKeyRequest
// Response: { "api_key", "namespace", "plan", "wallet" }
// Or 202 Accepted: { "status": "provisioning", "cluster_id", "poll_url" }
func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
if h.authService == nil {
writeError(w, http.StatusServiceUnavailable, "auth service not initialized")
@ -44,6 +46,56 @@ func (h *Handlers) IssueAPIKeyHandler(w http.ResponseWriter, r *http.Request) {
nsID, _ := h.resolveNamespace(ctx, req.Namespace)
h.markNonceUsed(ctx, nsID, strings.ToLower(req.Wallet), req.Nonce)
// Check if namespace cluster provisioning is needed (for non-default namespaces)
namespace := strings.TrimSpace(req.Namespace)
if namespace == "" {
namespace = "default"
}
if h.clusterProvisioner != nil && namespace != "default" {
clusterID, status, needsProvisioning, err := h.clusterProvisioner.CheckNamespaceCluster(ctx, namespace)
if err != nil {
// Log but don't fail - cluster provisioning is optional (error may just mean no cluster yet)
_ = err
} else if needsProvisioning {
// Trigger provisioning for new namespace
nsIDInt := 0
if id, ok := nsID.(int); ok {
nsIDInt = id
} else if id, ok := nsID.(int64); ok {
nsIDInt = int(id)
} else if id, ok := nsID.(float64); ok {
nsIDInt = int(id)
}
newClusterID, pollURL, provErr := h.clusterProvisioner.ProvisionNamespaceCluster(ctx, nsIDInt, namespace, req.Wallet)
if provErr != nil {
writeError(w, http.StatusInternalServerError, "failed to start cluster provisioning")
return
}
writeJSON(w, http.StatusAccepted, map[string]any{
"status": "provisioning",
"cluster_id": newClusterID,
"poll_url": pollURL,
"estimated_time_seconds": 60,
"message": "Namespace cluster is being provisioned. Poll the status URL for updates.",
})
return
} else if status == "provisioning" {
// Already provisioning, return poll URL
writeJSON(w, http.StatusAccepted, map[string]any{
"status": "provisioning",
"cluster_id": clusterID,
"poll_url": "/v1/namespace/status?id=" + clusterID,
"estimated_time_seconds": 60,
"message": "Namespace cluster is being provisioned. Poll the status URL for updates.",
})
return
}
// If status is "ready" or "default", proceed with API key generation
}
apiKey, err := h.authService.GetOrCreateAPIKey(ctx, req.Wallet, req.Namespace)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())


@ -35,13 +35,24 @@ type QueryResult struct {
Rows []interface{} `json:"rows"`
}
// ClusterProvisioner defines the interface for namespace cluster provisioning
type ClusterProvisioner interface {
// CheckNamespaceCluster checks if a namespace has a cluster and returns its status
// Returns: (clusterID, status, needsProvisioning, error)
CheckNamespaceCluster(ctx context.Context, namespaceName string) (string, string, bool, error)
// ProvisionNamespaceCluster triggers provisioning for a new namespace
// Returns: (clusterID, pollURL, error)
ProvisionNamespaceCluster(ctx context.Context, namespaceID int, namespaceName, wallet string) (string, string, error)
}
// Handlers holds dependencies for authentication HTTP handlers
type Handlers struct {
logger *logging.ColoredLogger
authService *authsvc.Service
netClient NetworkClient
defaultNS string
internalAuthFn func(context.Context) context.Context
clusterProvisioner ClusterProvisioner // Optional: for namespace cluster provisioning
}
// NewHandlers creates a new authentication handlers instance
@ -61,6 +72,11 @@ func NewHandlers(
}
}
// SetClusterProvisioner sets the cluster provisioner for namespace cluster management
func (h *Handlers) SetClusterProvisioner(cp ClusterProvisioner) {
h.clusterProvisioner = cp
}
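For wiring or tests, a minimal no-op implementation of ClusterProvisioner could look like the following (a sketch, not part of the diff; it assumes the package's context and errors imports):

// noopProvisioner satisfies ClusterProvisioner but never provisions anything:
// CheckNamespaceCluster reports a ready cluster with no provisioning needed,
// so IssueAPIKeyHandler falls straight through to API key generation.
type noopProvisioner struct{}

func (noopProvisioner) CheckNamespaceCluster(ctx context.Context, namespaceName string) (string, string, bool, error) {
	return "", "ready", false, nil
}

func (noopProvisioner) ProvisionNamespaceCluster(ctx context.Context, namespaceID int, namespaceName, wallet string) (string, string, error) {
	return "", "", errors.New("cluster provisioning not supported")
}

It would be attached with h.SetClusterProvisioner(noopProvisioner{}); leaving the provisioner nil skips the provisioning branch entirely, since the handler checks h.clusterProvisioner != nil.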
// markNonceUsed marks a nonce as used in the database
func (h *Handlers) markNonceUsed(ctx context.Context, namespaceID interface{}, wallet, nonce string) {
if h.netClient == nil {


@ -2,8 +2,10 @@ package deployments
import (
"context"
"crypto/rand"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/deployments"
@ -12,6 +14,13 @@ import (
"go.uber.org/zap"
)
const (
// subdomainSuffixLength is the length of the random suffix for deployment subdomains
subdomainSuffixLength = 6
// subdomainSuffixChars are the allowed characters for the random suffix (lowercase alphanumeric)
subdomainSuffixChars = "abcdefghijklmnopqrstuvwxyz0123456789"
)
// DeploymentService manages deployment operations
type DeploymentService struct {
db rqlite.Client
@ -74,6 +83,87 @@ func GetShortNodeID(peerID string) string {
return "node-" + peerID[:6]
}
// generateRandomSuffix generates a random alphanumeric suffix for subdomains
func generateRandomSuffix(length int) string {
b := make([]byte, length)
if _, err := rand.Read(b); err != nil {
// Fallback to timestamp-based if crypto/rand fails
return fmt.Sprintf("%06x", time.Now().UnixNano()%0xffffff)
}
for i := range b {
b[i] = subdomainSuffixChars[int(b[i])%len(subdomainSuffixChars)]
}
return string(b)
}
// generateSubdomain generates a unique subdomain for a deployment
// Format: {name}-{random} (e.g., "myapp-f3o4if")
func (s *DeploymentService) generateSubdomain(ctx context.Context, name, namespace, deploymentID string) (string, error) {
// Sanitize name for subdomain (lowercase, alphanumeric and hyphens only)
sanitizedName := strings.ToLower(name)
sanitizedName = strings.Map(func(r rune) rune {
if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
return r
}
return '-'
}, sanitizedName)
// Remove consecutive hyphens and trim
for strings.Contains(sanitizedName, "--") {
sanitizedName = strings.ReplaceAll(sanitizedName, "--", "-")
}
sanitizedName = strings.Trim(sanitizedName, "-")
// Try to generate a unique subdomain (max 10 attempts)
for i := 0; i < 10; i++ {
suffix := generateRandomSuffix(subdomainSuffixLength)
subdomain := fmt.Sprintf("%s-%s", sanitizedName, suffix)
// Check if subdomain is already taken globally
exists, err := s.subdomainExists(ctx, subdomain)
if err != nil {
return "", fmt.Errorf("failed to check subdomain: %w", err)
}
if !exists {
// Register the subdomain globally
if err := s.registerSubdomain(ctx, subdomain, namespace, deploymentID); err != nil {
// If registration fails (race condition), try again
s.logger.Warn("Failed to register subdomain, retrying",
zap.String("subdomain", subdomain),
zap.Error(err),
)
continue
}
return subdomain, nil
}
}
return "", fmt.Errorf("failed to generate unique subdomain after 10 attempts")
}
// subdomainExists checks if a subdomain is already registered globally
func (s *DeploymentService) subdomainExists(ctx context.Context, subdomain string) (bool, error) {
type existsRow struct {
Found int `db:"found"` // "exists" is a reserved word in SQLite, so alias the column as "found"
}
var rows []existsRow
query := `SELECT 1 AS found FROM global_deployment_subdomains WHERE subdomain = ? LIMIT 1`
err := s.db.Query(ctx, &rows, query, subdomain)
if err != nil {
return false, err
}
return len(rows) > 0, nil
}
// registerSubdomain registers a subdomain in the global registry
func (s *DeploymentService) registerSubdomain(ctx context.Context, subdomain, namespace, deploymentID string) error {
query := `
INSERT INTO global_deployment_subdomains (subdomain, namespace, deployment_id, created_at)
VALUES (?, ?, ?, ?)
`
_, err := s.db.Exec(ctx, query, subdomain, namespace, deploymentID, time.Now())
return err
}
// CreateDeployment creates a new deployment
func (s *DeploymentService) CreateDeployment(ctx context.Context, deployment *deployments.Deployment) error {
// Always use current node's peer ID for home node
@ -90,6 +180,16 @@ func (s *DeploymentService) CreateDeployment(ctx context.Context, deployment *de
deployment.HomeNodeID = homeNodeID
}
// Generate unique subdomain with random suffix if not already set
// Format: {name}-{random} (e.g., "myapp-f3o4if")
if deployment.Subdomain == "" {
subdomain, err := s.generateSubdomain(ctx, deployment.Name, deployment.Namespace, deployment.ID)
if err != nil {
return fmt.Errorf("failed to generate subdomain: %w", err)
}
deployment.Subdomain = subdomain
}
// Allocate port for dynamic deployments
if deployment.Type != deployments.DeploymentTypeStatic && deployment.Type != deployments.DeploymentTypeNextJSStatic {
port, err := s.portAllocator.AllocatePort(ctx, deployment.HomeNodeID, deployment.ID)
@ -307,13 +407,24 @@ func (s *DeploymentService) CreateDNSRecords(ctx context.Context, deployment *de
return err
}
// Use subdomain if set, otherwise fall back to name
// New format: {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
dnsName := deployment.Subdomain
if dnsName == "" {
dnsName = deployment.Name
}
// Create deployment record: {subdomain}.{baseDomain}
// Any node can receive the request and proxy to the home node if needed
fqdn := fmt.Sprintf("%s.%s.", dnsName, s.BaseDomain())
if err := s.createDNSRecord(ctx, fqdn, "A", nodeIP, deployment.Namespace, deployment.ID); err != nil {
s.logger.Error("Failed to create DNS record", zap.Error(err))
} else {
s.logger.Info("Created DNS record",
zap.String("fqdn", fqdn),
zap.String("ip", nodeIP),
zap.String("subdomain", dnsName),
)
}
return nil
@ -373,9 +484,14 @@ func (s *DeploymentService) getNodeIP(ctx context.Context, nodeID string) (strin
// BuildDeploymentURLs builds all URLs for a deployment
func (s *DeploymentService) BuildDeploymentURLs(deployment *deployments.Deployment) []string {
// Use subdomain if set, otherwise fall back to name
// New format: {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
dnsName := deployment.Subdomain
if dnsName == "" {
dnsName = deployment.Name
}
return []string{
fmt.Sprintf("https://%s.%s", dnsName, s.BaseDomain()),
}
}


@ -0,0 +1,206 @@
// Package namespace provides HTTP handlers for namespace cluster operations
package namespace
import (
"encoding/json"
"net/http"
"github.com/DeBrosOfficial/network/pkg/logging"
ns "github.com/DeBrosOfficial/network/pkg/namespace"
"go.uber.org/zap"
)
// StatusHandler handles namespace cluster status requests
type StatusHandler struct {
clusterManager *ns.ClusterManager
logger *zap.Logger
}
// NewStatusHandler creates a new namespace status handler
func NewStatusHandler(clusterManager *ns.ClusterManager, logger *logging.ColoredLogger) *StatusHandler {
return &StatusHandler{
clusterManager: clusterManager,
logger: logger.Logger.With(zap.String("handler", "namespace-status")),
}
}
// StatusResponse represents the response for /v1/namespace/status
type StatusResponse struct {
ClusterID string `json:"cluster_id"`
Namespace string `json:"namespace"`
Status string `json:"status"`
Nodes []string `json:"nodes"`
RQLiteReady bool `json:"rqlite_ready"`
OlricReady bool `json:"olric_ready"`
GatewayReady bool `json:"gateway_ready"`
DNSReady bool `json:"dns_ready"`
Error string `json:"error,omitempty"`
GatewayURL string `json:"gateway_url,omitempty"`
}
// Handle handles GET /v1/namespace/status?id={cluster_id}
func (h *StatusHandler) Handle(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
return
}
clusterID := r.URL.Query().Get("id")
if clusterID == "" {
writeError(w, http.StatusBadRequest, "cluster_id parameter required")
return
}
ctx := r.Context()
status, err := h.clusterManager.GetClusterStatus(ctx, clusterID)
if err != nil {
h.logger.Error("Failed to get cluster status",
zap.String("cluster_id", clusterID),
zap.Error(err),
)
writeError(w, http.StatusNotFound, "cluster not found")
return
}
resp := StatusResponse{
ClusterID: status.ClusterID,
Namespace: status.Namespace,
Status: string(status.Status),
Nodes: status.Nodes,
RQLiteReady: status.RQLiteReady,
OlricReady: status.OlricReady,
GatewayReady: status.GatewayReady,
DNSReady: status.DNSReady,
Error: status.Error,
}
// Include gateway URL when ready
if status.Status == ns.ClusterStatusReady {
// Gateway URL would be constructed from cluster configuration
// For now, we'll leave it empty and let the client construct it
}
writeJSON(w, http.StatusOK, resp)
}
// HandleByName handles GET /v1/namespace/status/name/{namespace}
func (h *StatusHandler) HandleByName(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
return
}
// Extract namespace from path
path := r.URL.Path
namespace := ""
const prefix = "/v1/namespace/status/name/"
if len(path) > len(prefix) {
namespace = path[len(prefix):]
}
if namespace == "" {
writeError(w, http.StatusBadRequest, "namespace parameter required")
return
}
ctx := r.Context()
cluster, err := h.clusterManager.GetClusterByNamespaceName(ctx, namespace)
if err != nil {
h.logger.Debug("Cluster not found for namespace",
zap.String("namespace", namespace),
zap.Error(err),
)
writeError(w, http.StatusNotFound, "cluster not found for namespace")
return
}
status, err := h.clusterManager.GetClusterStatus(ctx, cluster.ID)
if err != nil {
writeError(w, http.StatusInternalServerError, "failed to get cluster status")
return
}
resp := StatusResponse{
ClusterID: status.ClusterID,
Namespace: status.Namespace,
Status: string(status.Status),
Nodes: status.Nodes,
RQLiteReady: status.RQLiteReady,
OlricReady: status.OlricReady,
GatewayReady: status.GatewayReady,
DNSReady: status.DNSReady,
Error: status.Error,
}
writeJSON(w, http.StatusOK, resp)
}
// ProvisionRequest represents a request to provision a new namespace cluster
type ProvisionRequest struct {
Namespace string `json:"namespace"`
ProvisionedBy string `json:"provisioned_by"` // Wallet address
}
// ProvisionResponse represents the response when provisioning starts
type ProvisionResponse struct {
Status string `json:"status"`
ClusterID string `json:"cluster_id"`
PollURL string `json:"poll_url"`
EstimatedTimeSeconds int `json:"estimated_time_seconds"`
}
// HandleProvision handles POST /v1/namespace/provision
func (h *StatusHandler) HandleProvision(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
return
}
var req ProvisionRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid json body")
return
}
if req.Namespace == "" || req.ProvisionedBy == "" {
writeError(w, http.StatusBadRequest, "namespace and provisioned_by are required")
return
}
// Don't allow provisioning the "default" namespace this way
if req.Namespace == "default" {
writeError(w, http.StatusBadRequest, "cannot provision the default namespace")
return
}
ctx := r.Context()
// Check if namespace exists
// For now, we assume the namespace ID is passed or we look it up
// This would typically be done through the auth service
// For simplicity, we'll use a placeholder namespace ID
h.logger.Info("Namespace provisioning requested",
zap.String("namespace", req.Namespace),
zap.String("provisioned_by", req.ProvisionedBy),
)
// Note: In a full implementation, we'd look up the namespace ID from the database
// For now, we'll create a placeholder that indicates provisioning should happen
// The actual provisioning is triggered through the auth flow
writeJSON(w, http.StatusAccepted, map[string]interface{}{
"status": "accepted",
"message": "Provisioning request accepted. Use auth flow to provision namespace cluster.",
})
}
func writeJSON(w http.ResponseWriter, status int, data interface{}) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
json.NewEncoder(w).Encode(data)
}
func writeError(w http.ResponseWriter, status int, message string) {
writeJSON(w, status, map[string]string{"error": message})
}
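Route registration is not shown in this excerpt; attaching the three handlers to a standard mux with the paths documented above might look like this (an assumed sketch):

// RegisterRoutes wires the namespace status endpoints onto an http.ServeMux.
func RegisterRoutes(mux *http.ServeMux, h *StatusHandler) {
	mux.HandleFunc("/v1/namespace/status", h.Handle)             // GET ?id={cluster_id}
	mux.HandleFunc("/v1/namespace/status/name/", h.HandleByName) // GET .../name/{namespace}
	mux.HandleFunc("/v1/namespace/provision", h.HandleProvision) // POST
}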


@ -0,0 +1,469 @@
package gateway
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/tlsutil"
"go.uber.org/zap"
"gopkg.in/yaml.v3"
)
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
type InstanceNodeStatus string
const (
InstanceStatusPending InstanceNodeStatus = "pending"
InstanceStatusStarting InstanceNodeStatus = "starting"
InstanceStatusRunning InstanceNodeStatus = "running"
InstanceStatusStopped InstanceNodeStatus = "stopped"
InstanceStatusFailed InstanceNodeStatus = "failed"
)
// InstanceError represents an error during instance operations (local type to avoid import cycle)
type InstanceError struct {
Message string
Cause error
}
func (e *InstanceError) Error() string {
if e.Cause != nil {
return e.Message + ": " + e.Cause.Error()
}
return e.Message
}
func (e *InstanceError) Unwrap() error {
return e.Cause
}
// InstanceSpawner manages multiple Gateway instances for namespace clusters.
// Each namespace gets its own gateway instances that connect to its dedicated RQLite and Olric clusters.
type InstanceSpawner struct {
logger *zap.Logger
baseDir string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
instances map[string]*GatewayInstance
mu sync.RWMutex
}
// GatewayInstance represents a running Gateway instance for a namespace
type GatewayInstance struct {
Namespace string
NodeID string
HTTPPort int
BaseDomain string
RQLiteDSN string // Connection to namespace RQLite
OlricServers []string // Connection to namespace Olric
ConfigPath string
PID int
Status InstanceNodeStatus
StartedAt time.Time
LastHealthCheck time.Time
cmd *exec.Cmd
logger *zap.Logger
}
// InstanceConfig holds configuration for spawning a Gateway instance
type InstanceConfig struct {
Namespace string // Namespace name (e.g., "alice")
NodeID string // Physical node ID
HTTPPort int // HTTP API port
BaseDomain string // Base domain (e.g., "devnet-orama.network")
RQLiteDSN string // RQLite connection DSN (e.g., "http://localhost:10000")
OlricServers []string // Olric server addresses
NodePeerID string // Physical node's peer ID for home node management
DataDir string // Data directory for deployments, SQLite, etc.
}
// GatewayYAMLConfig represents the gateway YAML configuration structure
type GatewayYAMLConfig struct {
ListenAddr string `yaml:"listen_addr"`
ClientNamespace string `yaml:"client_namespace"`
RQLiteDSN string `yaml:"rqlite_dsn"`
OlricServers []string `yaml:"olric_servers"`
BaseDomain string `yaml:"base_domain"`
NodePeerID string `yaml:"node_peer_id"`
DataDir string `yaml:"data_dir"`
}
// NewInstanceSpawner creates a new Gateway instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
return &InstanceSpawner{
logger: logger.With(zap.String("component", "gateway-instance-spawner")),
baseDir: baseDir,
instances: make(map[string]*GatewayInstance),
}
}
// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(ns, nodeID string) string {
return fmt.Sprintf("%s:%s", ns, nodeID)
}
// SpawnInstance starts a new Gateway instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*GatewayInstance, error) {
key := instanceKey(cfg.Namespace, cfg.NodeID)
is.mu.Lock()
if existing, ok := is.instances[key]; ok {
is.mu.Unlock()
// Instance already exists, return it if running
if existing.Status == InstanceStatusRunning {
return existing, nil
}
// Otherwise, remove it and start fresh
is.mu.Lock()
delete(is.instances, key)
}
is.mu.Unlock()
// Create config and logs directories
configDir := filepath.Join(is.baseDir, cfg.Namespace, "configs")
logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
dataDir := filepath.Join(is.baseDir, cfg.Namespace, "data")
for _, dir := range []string{configDir, logsDir, dataDir} {
if err := os.MkdirAll(dir, 0755); err != nil {
return nil, &InstanceError{
Message: fmt.Sprintf("failed to create directory %s", dir),
Cause: err,
}
}
}
// Generate config file
configPath := filepath.Join(configDir, fmt.Sprintf("gateway-%s.yaml", cfg.NodeID))
if err := is.generateConfig(configPath, cfg, dataDir); err != nil {
return nil, err
}
instance := &GatewayInstance{
Namespace: cfg.Namespace,
NodeID: cfg.NodeID,
HTTPPort: cfg.HTTPPort,
BaseDomain: cfg.BaseDomain,
RQLiteDSN: cfg.RQLiteDSN,
OlricServers: cfg.OlricServers,
ConfigPath: configPath,
Status: InstanceStatusStarting,
logger: is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
}
instance.logger.Info("Starting Gateway instance",
zap.Int("http_port", cfg.HTTPPort),
zap.String("rqlite_dsn", cfg.RQLiteDSN),
zap.Strings("olric_servers", cfg.OlricServers),
)
// Find the gateway binary (should be in same directory as the current process or PATH)
gatewayBinary := "gateway"
// Create command
cmd := exec.CommandContext(ctx, gatewayBinary, "--config", configPath)
instance.cmd = cmd
// Setup logging
logPath := filepath.Join(logsDir, fmt.Sprintf("gateway-%s.log", cfg.NodeID))
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return nil, &InstanceError{
Message: "failed to open log file",
Cause: err,
}
}
cmd.Stdout = logFile
cmd.Stderr = logFile
// Start the process
if err := cmd.Start(); err != nil {
logFile.Close()
return nil, &InstanceError{
Message: "failed to start Gateway process",
Cause: err,
}
}
logFile.Close()
instance.PID = cmd.Process.Pid
instance.StartedAt = time.Now()
// Store instance
is.mu.Lock()
is.instances[key] = instance
is.mu.Unlock()
// Wait for instance to be ready
if err := is.waitForInstanceReady(ctx, instance); err != nil {
// Kill the process on failure
if cmd.Process != nil {
_ = cmd.Process.Kill()
}
is.mu.Lock()
delete(is.instances, key)
is.mu.Unlock()
return nil, &InstanceError{
Message: "Gateway instance did not become ready",
Cause: err,
}
}
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
instance.logger.Info("Gateway instance started successfully",
zap.Int("pid", instance.PID),
)
// Start background process monitor
go is.monitorInstance(instance)
return instance, nil
}
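An assumed call site for SpawnInstance, reusing the 5-port block layout from the migration (all values illustrative):

// spawnNamespaceGateway starts a gateway for namespace "alice" on one node,
// pointing it at that namespace's RQLite (port_start + 0) and Olric
// (port_start + 2) instances, with the gateway itself on port_start + 4.
func spawnNamespaceGateway(ctx context.Context, spawner *InstanceSpawner) (*GatewayInstance, error) {
	return spawner.SpawnInstance(ctx, InstanceConfig{
		Namespace:    "alice",
		NodeID:       "node-a1b2c3",
		HTTPPort:     10004,
		BaseDomain:   "devnet-orama.network",
		RQLiteDSN:    "http://localhost:10000",
		OlricServers: []string{"localhost:10002"},
		NodePeerID:   "12D3KooWExamplePeerID",
	})
}

SpawnInstance writes the YAML config under baseDir/alice/configs, execs the gateway binary found on PATH, and only returns once the instance answers /v1/health on port 10004.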
// generateConfig generates the Gateway YAML configuration file
func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig, dataDir string) error {
gatewayCfg := GatewayYAMLConfig{
ListenAddr: fmt.Sprintf(":%d", cfg.HTTPPort),
ClientNamespace: cfg.Namespace,
RQLiteDSN: cfg.RQLiteDSN,
OlricServers: cfg.OlricServers,
BaseDomain: cfg.BaseDomain,
NodePeerID: cfg.NodePeerID,
DataDir: dataDir,
}
data, err := yaml.Marshal(gatewayCfg)
if err != nil {
return &InstanceError{
Message: "failed to marshal Gateway config",
Cause: err,
}
}
if err := os.WriteFile(configPath, data, 0644); err != nil {
return &InstanceError{
Message: "failed to write Gateway config",
Cause: err,
}
}
return nil
}
// StopInstance stops a Gateway instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, ns, nodeID string) error {
key := instanceKey(ns, nodeID)
is.mu.Lock()
instance, ok := is.instances[key]
if !ok {
is.mu.Unlock()
return nil // Already stopped
}
delete(is.instances, key)
is.mu.Unlock()
if instance.cmd != nil && instance.cmd.Process != nil {
instance.logger.Info("Stopping Gateway instance", zap.Int("pid", instance.PID))
// Send SIGTERM for graceful shutdown
if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
// If SIGTERM fails, kill it
_ = instance.cmd.Process.Kill()
}
// Wait for process to exit with timeout
done := make(chan error, 1)
go func() {
done <- instance.cmd.Wait()
}()
select {
case <-done:
instance.logger.Info("Gateway instance stopped gracefully")
case <-time.After(10 * time.Second):
instance.logger.Warn("Gateway instance did not stop gracefully, killing")
_ = instance.cmd.Process.Kill()
case <-ctx.Done():
_ = instance.cmd.Process.Kill()
return ctx.Err()
}
}
instance.Status = InstanceStatusStopped
return nil
}
// StopAllInstances stops all Gateway instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
is.mu.RLock()
var keys []string
for key, inst := range is.instances {
if inst.Namespace == ns {
keys = append(keys, key)
}
}
is.mu.RUnlock()
var lastErr error
for _, key := range keys {
parts := strings.SplitN(key, ":", 2)
if len(parts) == 2 {
if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
lastErr = err
}
}
}
return lastErr
}
// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(ns, nodeID string) (*GatewayInstance, bool) {
is.mu.RLock()
defer is.mu.RUnlock()
instance, ok := is.instances[instanceKey(ns, nodeID)]
return instance, ok
}
// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*GatewayInstance {
is.mu.RLock()
defer is.mu.RUnlock()
var instances []*GatewayInstance
for _, inst := range is.instances {
if inst.Namespace == ns {
instances = append(instances, inst)
}
}
return instances
}
// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, ns, nodeID string) (bool, error) {
instance, ok := is.GetInstance(ns, nodeID)
if !ok {
return false, &InstanceError{Message: "instance not found"}
}
healthy, err := instance.IsHealthy(ctx)
if healthy {
is.mu.Lock()
instance.LastHealthCheck = time.Now()
is.mu.Unlock()
}
return healthy, err
}
// waitForInstanceReady waits for the Gateway instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *GatewayInstance) error {
client := tlsutil.NewHTTPClient(2 * time.Second)
// Gateway health check endpoint
url := fmt.Sprintf("http://localhost:%d/v1/health", instance.HTTPPort)
maxAttempts := 120 // ~2 minutes at one attempt per second
for i := 0; i < maxAttempts; i++ {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
}
resp, err := client.Get(url)
if err != nil {
continue
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
instance.logger.Debug("Gateway instance ready",
zap.Int("attempts", i+1),
)
return nil
}
}
return fmt.Errorf("Gateway did not become ready within timeout")
}
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *GatewayInstance) {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for range ticker.C {
is.mu.RLock()
key := instanceKey(instance.Namespace, instance.NodeID)
_, exists := is.instances[key]
is.mu.RUnlock()
if !exists {
// Instance was removed
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
healthy, _ := instance.IsHealthy(ctx)
cancel()
is.mu.Lock()
if healthy {
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
} else {
instance.Status = InstanceStatusFailed
instance.logger.Warn("Gateway instance health check failed")
}
is.mu.Unlock()
// Check if process is still running
if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
is.mu.Lock()
instance.Status = InstanceStatusStopped
is.mu.Unlock()
instance.logger.Warn("Gateway instance process exited unexpectedly")
return
}
}
}
// IsHealthy checks if the Gateway instance is healthy
func (gi *GatewayInstance) IsHealthy(ctx context.Context) (bool, error) {
url := fmt.Sprintf("http://localhost:%d/v1/health", gi.HTTPPort)
client := tlsutil.NewHTTPClient(5 * time.Second)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return false, err
}
resp, err := client.Do(req)
if err != nil {
return false, err
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK, nil
}
// DSN returns the local connection address for this Gateway instance
func (gi *GatewayInstance) DSN() string {
return fmt.Sprintf("http://localhost:%d", gi.HTTPPort)
}
// ExternalURL returns the external URL for accessing this namespace's gateway
func (gi *GatewayInstance) ExternalURL() string {
return fmt.Sprintf("https://ns-%s.%s", gi.Namespace, gi.BaseDomain)
}
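// exampleSpawnNamespaceGateway is an illustrative sketch only (not called by
// the spawner or the cluster manager). It shows how a single namespace Gateway
// instance could be spawned and torn down; the data directory, ports and
// addresses are placeholder values following the namespace port block layout
// (gateway HTTP = port_start + 4).
func exampleSpawnNamespaceGateway(ctx context.Context, logger *zap.Logger) error {
	spawner := NewInstanceSpawner("/home/orama/.orama/data/namespaces", logger)
	inst, err := spawner.SpawnInstance(ctx, InstanceConfig{
		Namespace:    "alice",
		NodeID:       "node-1",
		HTTPPort:     10004,                      // port_start + 4
		RQLiteDSN:    "http://10.0.0.1:10000",    // namespace RQLite HTTP endpoint
		OlricServers: []string{"10.0.0.1:10002"}, // namespace Olric HTTP endpoints
		BaseDomain:   "devnet-orama.network",
		NodePeerID:   "node-1",
		DataDir:      "/home/orama/.orama/data/namespaces",
	})
	if err != nil {
		return err
	}
	logger.Info("namespace gateway up",
		zap.String("local_dsn", inst.DSN()),
		zap.String("external_url", inst.ExternalURL()),
	)
	// Graceful shutdown: SIGTERM first, kill after the 10s grace period.
	return spawner.StopInstance(ctx, "alice", "node-1")
}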

View File

@ -207,6 +207,9 @@ func isPublicPath(p string) bool {
// authorizationMiddleware enforces that the authenticated actor owns the namespace
// for certain protected paths (e.g., apps CRUD and storage APIs).
// Also enforces cross-namespace access control:
// - "default" namespace: accessible by any valid API key
// - Other namespaces: API key must belong to that specific namespace
func (g *Gateway) authorizationMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Skip for public/OPTIONS paths only
@ -221,7 +224,40 @@ func (g *Gateway) authorizationMiddleware(next http.Handler) http.Handler {
return
}
// Exempt namespace status endpoint
if strings.HasPrefix(r.URL.Path, "/v1/namespace/status") {
next.ServeHTTP(w, r)
return
}
// Cross-namespace access control for namespace gateways
// The gateway's ClientNamespace determines which namespace this gateway serves
gatewayNamespace := "default"
if g.cfg != nil && g.cfg.ClientNamespace != "" {
gatewayNamespace = strings.TrimSpace(g.cfg.ClientNamespace)
}
// Get user's namespace from context (derived from API key/JWT)
userNamespace := ""
if v := r.Context().Value(CtxKeyNamespaceOverride); v != nil {
if s, ok := v.(string); ok {
userNamespace = strings.TrimSpace(s)
}
}
// For non-default namespace gateways, the API key must belong to this namespace
// This enforces physical isolation: alice's gateway only accepts alice's API keys
if gatewayNamespace != "default" && userNamespace != "" && userNamespace != gatewayNamespace {
g.logger.ComponentWarn(logging.ComponentGeneral, "cross-namespace access denied",
zap.String("user_namespace", userNamespace),
zap.String("gateway_namespace", gatewayNamespace),
zap.String("path", r.URL.Path),
)
writeError(w, http.StatusForbidden, "API key does not belong to this namespace")
return
}
// Only enforce ownership for specific resource paths
if !requiresNamespaceOwnership(r.URL.Path) {
next.ServeHTTP(w, r)
return
@ -433,8 +469,14 @@ func getClientIP(r *http.Request) string {
return host
}
// domainRoutingMiddleware handles requests to deployment domains and namespace gateways
// This must come BEFORE auth middleware so deployment domains work without API keys
//
// Domain routing patterns:
// - ns-{namespace}.{baseDomain} -> Namespace gateway (proxy to namespace cluster)
// - {name}-{random}.{baseDomain} -> Deployment domain
// - {name}.{baseDomain} -> Deployment domain (legacy)
// - {name}.node-xxx.{baseDomain} -> Legacy format (deprecated, returns 404 for new deployments)
func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
host := strings.Split(r.Host, ":")[0] // Strip port
@ -446,7 +488,7 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
}
// Only process base domain and its subdomains
if !strings.HasSuffix(host, "."+baseDomain) && host != baseDomain {
next.ServeHTTP(w, r)
return
}
@ -457,6 +499,18 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
return
}
// Check for namespace gateway domain: ns-{namespace}.{baseDomain}
suffix := "." + baseDomain
if strings.HasSuffix(host, suffix) {
subdomain := strings.TrimSuffix(host, suffix)
if strings.HasPrefix(subdomain, "ns-") {
// This is a namespace gateway request
namespaceName := strings.TrimPrefix(subdomain, "ns-")
g.handleNamespaceGatewayRequest(w, r, namespaceName)
return
}
}
// Check if deployment handlers are available
if g.deploymentService == nil || g.staticHandler == nil {
next.ServeHTTP(w, r)
@ -470,7 +524,7 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
return
}
if deployment == nil {
// Domain matches .{baseDomain} but no deployment found
http.NotFound(w, r)
return
}
@ -490,9 +544,112 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler {
})
}
// handleNamespaceGatewayRequest proxies requests to a namespace's dedicated gateway cluster
// This enables physical isolation where each namespace has its own RQLite, Olric, and Gateway
func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.Request, namespaceName string) {
// Look up namespace cluster gateway IPs from DNS records
db := g.client.Database()
internalCtx := client.WithInternalAuth(r.Context())
baseDomain := "dbrs.space"
if g.cfg != nil && g.cfg.BaseDomain != "" {
baseDomain = g.cfg.BaseDomain
}
// Query DNS records for the namespace gateway
fqdn := "ns-" + namespaceName + "." + baseDomain + "."
query := `SELECT value FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND is_active = TRUE ORDER BY RANDOM() LIMIT 1`
result, err := db.Query(internalCtx, query, fqdn)
if err != nil || result == nil || len(result.Rows) == 0 {
// No gateway found for this namespace
g.logger.ComponentWarn(logging.ComponentGeneral, "namespace gateway not found",
zap.String("namespace", namespaceName),
zap.String("fqdn", fqdn),
)
http.Error(w, "Namespace gateway not found", http.StatusNotFound)
return
}
gatewayIP := getString(result.Rows[0][0])
if gatewayIP == "" {
http.Error(w, "Namespace gateway not available", http.StatusServiceUnavailable)
return
}
// Get the gateway port from namespace_port_allocations
// Gateway HTTP port is port_start + 4
portQuery := `
SELECT npa.gateway_http_port
FROM namespace_port_allocations npa
JOIN namespace_clusters nc ON npa.namespace_cluster_id = nc.id
WHERE nc.namespace_name = ?
LIMIT 1
`
portResult, err := db.Query(internalCtx, portQuery, namespaceName)
gatewayPort := 10004 // Default to first namespace's gateway port
if err == nil && portResult != nil && len(portResult.Rows) > 0 {
if p := getInt(portResult.Rows[0][0]); p > 0 {
gatewayPort = p
}
}
// Proxy request to the namespace gateway
targetURL := "http://" + gatewayIP + ":" + strconv.Itoa(gatewayPort) + r.URL.Path
if r.URL.RawQuery != "" {
targetURL += "?" + r.URL.RawQuery
}
proxyReq, err := http.NewRequest(r.Method, targetURL, r.Body)
if err != nil {
g.logger.ComponentError(logging.ComponentGeneral, "failed to create namespace gateway proxy request",
zap.String("namespace", namespaceName),
zap.Error(err),
)
http.Error(w, "Internal server error", http.StatusInternalServerError)
return
}
// Copy headers
for key, values := range r.Header {
for _, value := range values {
proxyReq.Header.Add(key, value)
}
}
proxyReq.Header.Set("X-Forwarded-For", getClientIP(r))
proxyReq.Header.Set("X-Forwarded-Proto", "https")
proxyReq.Header.Set("X-Forwarded-Host", r.Host)
proxyReq.Header.Set("X-Original-Host", r.Host)
// Execute proxy request
httpClient := &http.Client{Timeout: 30 * time.Second}
resp, err := httpClient.Do(proxyReq)
if err != nil {
g.logger.ComponentError(logging.ComponentGeneral, "namespace gateway proxy request failed",
zap.String("namespace", namespaceName),
zap.String("target", gatewayIP),
zap.Error(err),
)
http.Error(w, "Namespace gateway unavailable", http.StatusServiceUnavailable)
return
}
defer resp.Body.Close()
// Copy response headers
for key, values := range resp.Header {
for _, value := range values {
w.Header().Add(key, value)
}
}
// Write status code and body
w.WriteHeader(resp.StatusCode)
io.Copy(w, resp.Body)
}
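// Design note (sketch only, not what the handler above does): the manual copy
// of headers, status and body could alternatively be delegated to the standard
// library's reverse proxy once gatewayIP and gatewayPort are resolved, e.g.:
//
//	target, _ := url.Parse("http://" + gatewayIP + ":" + strconv.Itoa(gatewayPort))
//	proxy := httputil.NewSingleHostReverseProxy(target)
//	proxy.ServeHTTP(w, r)
//
// net/url and net/http/httputil would need to be imported for that variant.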
// getDeploymentByDomain looks up a deployment by its domain
// Supports formats like:
// - {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space) - new format with random suffix
// - {name}.{baseDomain} (e.g., myapp.dbrs.space) - legacy format (backwards compatibility)
// - {name}.node-{shortID}.{baseDomain} (legacy format for backwards compatibility)
// - custom domains via deployment_domains table
func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*deployments.Deployment, error) {
@ -512,25 +669,28 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
db := g.client.Database()
internalCtx := client.WithInternalAuth(ctx)
// Parse domain to extract deployment subdomain/name
suffix := "." + baseDomain
if strings.HasSuffix(domain, suffix) {
subdomain := strings.TrimSuffix(domain, suffix)
parts := strings.Split(subdomain, ".")
// Primary format: {subdomain}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space)
// The subdomain can be either:
// - {name}-{random} (new format)
// - {name} (legacy format)
if len(parts) == 1 {
subdomainOrName := parts[0]
// First, try to find by subdomain (new format: name-random)
query := `
SELECT id, namespace, name, type, port, content_cid, status, home_node_id, subdomain
FROM deployments
WHERE subdomain = ?
AND status = 'active'
LIMIT 1
`
result, err := db.Query(internalCtx, query, subdomainOrName)
if err == nil && len(result.Rows) > 0 {
row := result.Rows[0]
return &deployments.Deployment{
@ -542,6 +702,31 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de
ContentCID: getString(row[5]),
Status: deployments.DeploymentStatus(getString(row[6])),
HomeNodeID: getString(row[7]),
Subdomain: getString(row[8]),
}, nil
}
// Fallback: try by name for legacy deployments (without random suffix)
query = `
SELECT id, namespace, name, type, port, content_cid, status, home_node_id, subdomain
FROM deployments
WHERE name = ?
AND status = 'active'
LIMIT 1
`
result, err = db.Query(internalCtx, query, subdomainOrName)
if err == nil && len(result.Rows) > 0 {
row := result.Rows[0]
return &deployments.Deployment{
ID: getString(row[0]),
Namespace: getString(row[1]),
Name: getString(row[2]),
Type: deployments.DeploymentType(getString(row[3])),
Port: getInt(row[4]),
ContentCID: getString(row[5]),
Status: deployments.DeploymentStatus(getString(row[6])),
HomeNodeID: getString(row[7]),
Subdomain: getString(row[8]),
}, nil
}
}

View File

@ -0,0 +1,659 @@
package namespace
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/gateway"
"github.com/DeBrosOfficial/network/pkg/olric"
"github.com/DeBrosOfficial/network/pkg/rqlite"
rqliteClient "github.com/DeBrosOfficial/network/pkg/rqlite"
"github.com/google/uuid"
"go.uber.org/zap"
)
// ClusterManager orchestrates namespace cluster provisioning and lifecycle management.
// It coordinates the creation and teardown of RQLite, Olric, and Gateway instances
// for each namespace's dedicated cluster.
type ClusterManager struct {
db rqlite.Client
portAllocator *NamespacePortAllocator
nodeSelector *ClusterNodeSelector
rqliteSpawner *rqlite.InstanceSpawner
olricSpawner *olric.InstanceSpawner
gatewaySpawner *gateway.InstanceSpawner
dnsManager *DNSRecordManager
baseDomain string
baseDataDir string // Base directory for namespace data (e.g., ~/.orama/data/namespaces)
logger *zap.Logger
}
// ClusterManagerConfig holds configuration for the ClusterManager
type ClusterManagerConfig struct {
BaseDomain string // e.g., "devnet-orama.network"
BaseDataDir string // e.g., "~/.orama/data/namespaces"
}
// NewClusterManager creates a new cluster manager
func NewClusterManager(
db rqlite.Client,
cfg ClusterManagerConfig,
logger *zap.Logger,
) *ClusterManager {
portAllocator := NewNamespacePortAllocator(db, logger)
return &ClusterManager{
db: db,
portAllocator: portAllocator,
nodeSelector: NewClusterNodeSelector(db, portAllocator, logger),
rqliteSpawner: rqlite.NewInstanceSpawner(cfg.BaseDataDir, logger),
olricSpawner: olric.NewInstanceSpawner(cfg.BaseDataDir, logger),
gatewaySpawner: gateway.NewInstanceSpawner(cfg.BaseDataDir, logger),
dnsManager: NewDNSRecordManager(db, cfg.BaseDomain, logger),
baseDomain: cfg.BaseDomain,
baseDataDir: cfg.BaseDataDir,
logger: logger.With(zap.String("component", "cluster-manager")),
}
}
// ProvisionCluster provisions a complete namespace cluster (RQLite + Olric + Gateway).
// This is an asynchronous operation that returns immediately with a cluster ID.
// Use GetClusterStatus to poll for completion.
func (cm *ClusterManager) ProvisionCluster(ctx context.Context, namespaceID int, namespaceName, provisionedBy string) (*NamespaceCluster, error) {
internalCtx := client.WithInternalAuth(ctx)
// Check if cluster already exists
existing, err := cm.GetClusterByNamespaceID(ctx, namespaceID)
if err == nil && existing != nil {
if existing.Status == ClusterStatusReady {
return existing, nil
}
if existing.Status == ClusterStatusProvisioning {
return existing, nil // Already provisioning
}
// If failed or deprovisioning, allow re-provisioning
}
// Create cluster record
clusterID := uuid.New().String()
cluster := &NamespaceCluster{
ID: clusterID,
NamespaceID: namespaceID,
NamespaceName: namespaceName,
Status: ClusterStatusProvisioning,
RQLiteNodeCount: DefaultRQLiteNodeCount,
OlricNodeCount: DefaultOlricNodeCount,
GatewayNodeCount: DefaultGatewayNodeCount,
ProvisionedBy: provisionedBy,
ProvisionedAt: time.Now(),
}
// Insert cluster record
insertQuery := `
INSERT INTO namespace_clusters (
id, namespace_id, namespace_name, status,
rqlite_node_count, olric_node_count, gateway_node_count,
provisioned_by, provisioned_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
`
_, err = cm.db.Exec(internalCtx, insertQuery,
cluster.ID,
cluster.NamespaceID,
cluster.NamespaceName,
string(cluster.Status),
cluster.RQLiteNodeCount,
cluster.OlricNodeCount,
cluster.GatewayNodeCount,
cluster.ProvisionedBy,
cluster.ProvisionedAt,
)
if err != nil {
return nil, &ClusterError{
Message: "failed to create cluster record",
Cause: err,
}
}
// Log provisioning started event
cm.logEvent(internalCtx, clusterID, EventProvisioningStarted, "", "Cluster provisioning started", nil)
// Start async provisioning
go cm.doProvisioning(context.Background(), cluster)
return cluster, nil
}
// doProvisioning performs the actual cluster provisioning asynchronously
func (cm *ClusterManager) doProvisioning(ctx context.Context, cluster *NamespaceCluster) {
internalCtx := client.WithInternalAuth(ctx)
cm.logger.Info("Starting cluster provisioning",
zap.String("cluster_id", cluster.ID),
zap.String("namespace", cluster.NamespaceName),
)
// Step 1: Select nodes for the cluster
selectedNodes, err := cm.nodeSelector.SelectNodesForCluster(internalCtx, DefaultRQLiteNodeCount)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, "Failed to select nodes: "+err.Error())
return
}
nodeIDs := make([]string, len(selectedNodes))
for i, n := range selectedNodes {
nodeIDs[i] = n.NodeID
}
cm.logEvent(internalCtx, cluster.ID, EventNodesSelected, "", "Selected nodes for cluster", map[string]interface{}{
"node_ids": nodeIDs,
})
// Step 2: Allocate port blocks on each node
portBlocks := make([]*PortBlock, len(selectedNodes))
for i, node := range selectedNodes {
block, err := cm.portAllocator.AllocatePortBlock(internalCtx, node.NodeID, cluster.ID)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to allocate ports on node %s: %v", node.NodeID, err))
// Cleanup already allocated ports
for j := 0; j < i; j++ {
_ = cm.portAllocator.DeallocatePortBlock(internalCtx, cluster.ID, selectedNodes[j].NodeID)
}
return
}
portBlocks[i] = block
cm.logEvent(internalCtx, cluster.ID, EventPortsAllocated, node.NodeID,
fmt.Sprintf("Allocated ports %d-%d", block.PortStart, block.PortEnd), nil)
}
// Step 3: Start RQLite instances
// First node is the leader, others join it
rqliteInstances := make([]*rqlite.RQLiteInstance, len(selectedNodes))
// Start leader first
leaderNode := selectedNodes[0]
leaderPorts := portBlocks[0]
leaderConfig := rqlite.InstanceConfig{
Namespace: cluster.NamespaceName,
NodeID: leaderNode.NodeID,
HTTPPort: leaderPorts.RQLiteHTTPPort,
RaftPort: leaderPorts.RQLiteRaftPort,
HTTPAdvAddress: fmt.Sprintf("%s:%d", leaderNode.IPAddress, leaderPorts.RQLiteHTTPPort),
RaftAdvAddress: fmt.Sprintf("%s:%d", leaderNode.IPAddress, leaderPorts.RQLiteRaftPort),
IsLeader: true,
}
leaderInstance, err := cm.rqliteSpawner.SpawnInstance(internalCtx, leaderConfig)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start RQLite leader: %v", err))
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
return
}
rqliteInstances[0] = leaderInstance
cm.logEvent(internalCtx, cluster.ID, EventRQLiteStarted, leaderNode.NodeID, "RQLite leader started", nil)
// Create cluster node record for leader
cm.createClusterNodeRecord(internalCtx, cluster.ID, leaderNode.NodeID, NodeRoleRQLiteLeader, leaderPorts, leaderInstance.PID)
// Start followers and join them to leader
leaderJoinAddr := leaderInstance.AdvertisedDSN()
for i := 1; i < len(selectedNodes); i++ {
node := selectedNodes[i]
ports := portBlocks[i]
followerConfig := rqlite.InstanceConfig{
Namespace: cluster.NamespaceName,
NodeID: node.NodeID,
HTTPPort: ports.RQLiteHTTPPort,
RaftPort: ports.RQLiteRaftPort,
HTTPAdvAddress: fmt.Sprintf("%s:%d", node.IPAddress, ports.RQLiteHTTPPort),
RaftAdvAddress: fmt.Sprintf("%s:%d", node.IPAddress, ports.RQLiteRaftPort),
JoinAddresses: []string{leaderJoinAddr},
IsLeader: false,
}
followerInstance, err := cm.rqliteSpawner.SpawnInstance(internalCtx, followerConfig)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start RQLite follower on node %s: %v", node.NodeID, err))
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
return
}
rqliteInstances[i] = followerInstance
cm.logEvent(internalCtx, cluster.ID, EventRQLiteJoined, node.NodeID, "RQLite follower joined cluster", nil)
cm.createClusterNodeRecord(internalCtx, cluster.ID, node.NodeID, NodeRoleRQLiteFollower, ports, followerInstance.PID)
}
cm.logEvent(internalCtx, cluster.ID, EventRQLiteLeaderElected, leaderNode.NodeID, "RQLite cluster formed", nil)
// Step 4: Start Olric instances
// Collect all memberlist addresses for peer discovery
olricPeers := make([]string, len(selectedNodes))
for i, node := range selectedNodes {
olricPeers[i] = fmt.Sprintf("%s:%d", node.IPAddress, portBlocks[i].OlricMemberlistPort)
}
for i, node := range selectedNodes {
ports := portBlocks[i]
olricConfig := olric.InstanceConfig{
Namespace: cluster.NamespaceName,
NodeID: node.NodeID,
HTTPPort: ports.OlricHTTPPort,
MemberlistPort: ports.OlricMemberlistPort,
BindAddr: "0.0.0.0",
AdvertiseAddr: node.IPAddress,
PeerAddresses: olricPeers,
}
_, err := cm.olricSpawner.SpawnInstance(internalCtx, olricConfig)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start Olric on node %s: %v", node.NodeID, err))
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
return
}
cm.logEvent(internalCtx, cluster.ID, EventOlricStarted, node.NodeID, "Olric instance started", nil)
// Update cluster node record with Olric role
cm.updateClusterNodeOlricStatus(internalCtx, cluster.ID, node.NodeID)
}
cm.logEvent(internalCtx, cluster.ID, EventOlricJoined, "", "Olric cluster formed", nil)
// Step 5: Start Gateway instances
// Build Olric server list for gateway config
olricServers := make([]string, len(selectedNodes))
for i, node := range selectedNodes {
olricServers[i] = fmt.Sprintf("%s:%d", node.IPAddress, portBlocks[i].OlricHTTPPort)
}
for i, node := range selectedNodes {
ports := portBlocks[i]
gatewayConfig := gateway.InstanceConfig{
Namespace: cluster.NamespaceName,
NodeID: node.NodeID,
HTTPPort: ports.GatewayHTTPPort,
BaseDomain: cm.baseDomain,
RQLiteDSN: fmt.Sprintf("http://%s:%d", node.IPAddress, ports.RQLiteHTTPPort),
OlricServers: olricServers,
NodePeerID: node.NodeID, // Use node ID as peer ID
DataDir: cm.baseDataDir,
}
_, err := cm.gatewaySpawner.SpawnInstance(internalCtx, gatewayConfig)
if err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to start Gateway on node %s: %v", node.NodeID, err))
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
return
}
cm.logEvent(internalCtx, cluster.ID, EventGatewayStarted, node.NodeID, "Gateway instance started", nil)
// Update cluster node record with Gateway role
cm.updateClusterNodeGatewayStatus(internalCtx, cluster.ID, node.NodeID)
}
// Step 6: Create DNS records for namespace gateway
nodeIPs := make([]string, len(selectedNodes))
for i, node := range selectedNodes {
nodeIPs[i] = node.IPAddress
}
if err := cm.dnsManager.CreateNamespaceRecords(internalCtx, cluster.NamespaceName, nodeIPs); err != nil {
cm.failCluster(internalCtx, cluster.ID, fmt.Sprintf("Failed to create DNS records: %v", err))
cm.cleanupOnFailure(internalCtx, cluster.ID, selectedNodes, portBlocks)
return
}
cm.logEvent(internalCtx, cluster.ID, EventDNSCreated, "", "DNS records created", map[string]interface{}{
"domain": fmt.Sprintf("ns-%s.%s", cluster.NamespaceName, cm.baseDomain),
"node_ips": nodeIPs,
})
// Mark cluster as ready
now := time.Now()
updateQuery := `UPDATE namespace_clusters SET status = ?, ready_at = ? WHERE id = ?`
_, err = cm.db.Exec(internalCtx, updateQuery, string(ClusterStatusReady), now, cluster.ID)
if err != nil {
cm.logger.Error("Failed to update cluster status to ready",
zap.String("cluster_id", cluster.ID),
zap.Error(err),
)
}
cm.logEvent(internalCtx, cluster.ID, EventClusterReady, "", "Cluster is ready", nil)
cm.logger.Info("Cluster provisioning completed",
zap.String("cluster_id", cluster.ID),
zap.String("namespace", cluster.NamespaceName),
)
}
// DeprovisionCluster tears down all services for a namespace cluster
func (cm *ClusterManager) DeprovisionCluster(ctx context.Context, clusterID string) error {
internalCtx := client.WithInternalAuth(ctx)
// Get cluster info
cluster, err := cm.GetCluster(ctx, clusterID)
if err != nil {
return err
}
cm.logger.Info("Starting cluster deprovisioning",
zap.String("cluster_id", clusterID),
zap.String("namespace", cluster.NamespaceName),
)
// Update status to deprovisioning
updateQuery := `UPDATE namespace_clusters SET status = ? WHERE id = ?`
_, _ = cm.db.Exec(internalCtx, updateQuery, string(ClusterStatusDeprovisioning), clusterID)
cm.logEvent(internalCtx, clusterID, EventDeprovisionStarted, "", "Cluster deprovisioning started", nil)
// Stop all gateway instances
if err := cm.gatewaySpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
cm.logger.Warn("Error stopping gateway instances", zap.Error(err))
}
// Stop all olric instances
if err := cm.olricSpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
cm.logger.Warn("Error stopping olric instances", zap.Error(err))
}
// Stop all rqlite instances
if err := cm.rqliteSpawner.StopAllInstances(ctx, cluster.NamespaceName); err != nil {
cm.logger.Warn("Error stopping rqlite instances", zap.Error(err))
}
// Delete DNS records
if err := cm.dnsManager.DeleteNamespaceRecords(ctx, cluster.NamespaceName); err != nil {
cm.logger.Warn("Error deleting DNS records", zap.Error(err))
}
// Deallocate all ports
if err := cm.portAllocator.DeallocateAllPortBlocks(ctx, clusterID); err != nil {
cm.logger.Warn("Error deallocating ports", zap.Error(err))
}
// Delete cluster node records
deleteNodesQuery := `DELETE FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
_, _ = cm.db.Exec(internalCtx, deleteNodesQuery, clusterID)
// Delete cluster record
deleteClusterQuery := `DELETE FROM namespace_clusters WHERE id = ?`
_, err = cm.db.Exec(internalCtx, deleteClusterQuery, clusterID)
if err != nil {
return &ClusterError{
Message: "failed to delete cluster record",
Cause: err,
}
}
cm.logEvent(internalCtx, clusterID, EventDeprovisioned, "", "Cluster deprovisioned", nil)
cm.logger.Info("Cluster deprovisioning completed",
zap.String("cluster_id", clusterID),
zap.String("namespace", cluster.NamespaceName),
)
return nil
}
// GetCluster retrieves a cluster by ID
func (cm *ClusterManager) GetCluster(ctx context.Context, clusterID string) (*NamespaceCluster, error) {
internalCtx := client.WithInternalAuth(ctx)
var clusters []NamespaceCluster
query := `SELECT * FROM namespace_clusters WHERE id = ? LIMIT 1`
err := cm.db.Query(internalCtx, &clusters, query, clusterID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query cluster",
Cause: err,
}
}
if len(clusters) == 0 {
return nil, ErrClusterNotFound
}
return &clusters[0], nil
}
// GetClusterByNamespaceID retrieves a cluster by namespace ID
func (cm *ClusterManager) GetClusterByNamespaceID(ctx context.Context, namespaceID int) (*NamespaceCluster, error) {
internalCtx := client.WithInternalAuth(ctx)
var clusters []NamespaceCluster
query := `SELECT * FROM namespace_clusters WHERE namespace_id = ? LIMIT 1`
err := cm.db.Query(internalCtx, &clusters, query, namespaceID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query cluster",
Cause: err,
}
}
if len(clusters) == 0 {
return nil, ErrClusterNotFound
}
return &clusters[0], nil
}
// GetClusterByNamespaceName retrieves a cluster by namespace name
func (cm *ClusterManager) GetClusterByNamespaceName(ctx context.Context, namespaceName string) (*NamespaceCluster, error) {
internalCtx := client.WithInternalAuth(ctx)
var clusters []NamespaceCluster
query := `SELECT * FROM namespace_clusters WHERE namespace_name = ? LIMIT 1`
err := cm.db.Query(internalCtx, &clusters, query, namespaceName)
if err != nil {
return nil, &ClusterError{
Message: "failed to query cluster",
Cause: err,
}
}
if len(clusters) == 0 {
return nil, ErrClusterNotFound
}
return &clusters[0], nil
}
// GetClusterStatus returns the detailed provisioning status of a cluster
func (cm *ClusterManager) GetClusterStatus(ctx context.Context, clusterID string) (*ClusterProvisioningStatus, error) {
cluster, err := cm.GetCluster(ctx, clusterID)
if err != nil {
return nil, err
}
// Get cluster nodes
internalCtx := client.WithInternalAuth(ctx)
var nodes []ClusterNode
nodesQuery := `SELECT * FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
_ = cm.db.Query(internalCtx, &nodes, nodesQuery, clusterID)
// Determine component readiness
rqliteReady := false
olricReady := false
gatewayReady := false
rqliteCount := 0
olricCount := 0
gatewayCount := 0
for _, node := range nodes {
if node.Status == NodeStatusRunning {
switch node.Role {
case NodeRoleRQLiteLeader, NodeRoleRQLiteFollower:
rqliteCount++
case NodeRoleOlric:
olricCount++
case NodeRoleGateway:
gatewayCount++
}
}
}
// Consider ready if we have the expected number of each type
rqliteReady = rqliteCount >= cluster.RQLiteNodeCount
olricReady = olricCount >= cluster.OlricNodeCount
gatewayReady = gatewayCount >= cluster.GatewayNodeCount
// DNS is ready if cluster status is ready
dnsReady := cluster.Status == ClusterStatusReady
nodeIDs := make([]string, len(nodes))
for i, n := range nodes {
nodeIDs[i] = n.NodeID
}
status := &ClusterProvisioningStatus{
ClusterID: cluster.ID,
Namespace: cluster.NamespaceName,
Status: cluster.Status,
Nodes: nodeIDs,
RQLiteReady: rqliteReady,
OlricReady: olricReady,
GatewayReady: gatewayReady,
DNSReady: dnsReady,
Error: cluster.ErrorMessage,
CreatedAt: cluster.ProvisionedAt,
ReadyAt: cluster.ReadyAt,
}
return status, nil
}
// failCluster marks a cluster as failed with an error message
func (cm *ClusterManager) failCluster(ctx context.Context, clusterID, errorMsg string) {
cm.logger.Error("Cluster provisioning failed",
zap.String("cluster_id", clusterID),
zap.String("error", errorMsg),
)
updateQuery := `UPDATE namespace_clusters SET status = ?, error_message = ?, retry_count = retry_count + 1 WHERE id = ?`
_, _ = cm.db.Exec(ctx, updateQuery, string(ClusterStatusFailed), errorMsg, clusterID)
cm.logEvent(ctx, clusterID, EventClusterFailed, "", errorMsg, nil)
}
// cleanupOnFailure cleans up partial resources after a provisioning failure
func (cm *ClusterManager) cleanupOnFailure(ctx context.Context, clusterID string, nodes []NodeCapacity, portBlocks []*PortBlock) {
// Resolve the namespace name for this cluster so any started instances can be stopped
var namespaceName string
if len(portBlocks) > 0 {
// Query to get namespace name from cluster
var clusters []NamespaceCluster
query := `SELECT namespace_name FROM namespace_clusters WHERE id = ? LIMIT 1`
if err := cm.db.Query(ctx, &clusters, query, clusterID); err == nil && len(clusters) > 0 {
namespaceName = clusters[0].NamespaceName
}
}
if namespaceName != "" {
// Stop any started instances
_ = cm.gatewaySpawner.StopAllInstances(ctx, namespaceName)
_ = cm.olricSpawner.StopAllInstances(ctx, namespaceName)
_ = cm.rqliteSpawner.StopAllInstances(ctx, namespaceName)
}
// Deallocate ports
for _, node := range nodes {
_ = cm.portAllocator.DeallocatePortBlock(ctx, clusterID, node.NodeID)
}
// Delete cluster node records
deleteQuery := `DELETE FROM namespace_cluster_nodes WHERE namespace_cluster_id = ?`
_, _ = cm.db.Exec(ctx, deleteQuery, clusterID)
}
// logEvent logs a cluster lifecycle event
func (cm *ClusterManager) logEvent(ctx context.Context, clusterID string, eventType EventType, nodeID, message string, metadata map[string]interface{}) {
eventID := uuid.New().String()
var metadataJSON string
if metadata != nil {
data, _ := json.Marshal(metadata)
metadataJSON = string(data)
}
insertQuery := `
INSERT INTO namespace_cluster_events (id, namespace_cluster_id, event_type, node_id, message, metadata, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
`
_, _ = cm.db.Exec(ctx, insertQuery, eventID, clusterID, string(eventType), nodeID, message, metadataJSON, time.Now())
cm.logger.Debug("Cluster event logged",
zap.String("cluster_id", clusterID),
zap.String("event_type", string(eventType)),
zap.String("node_id", nodeID),
zap.String("message", message),
)
}
// createClusterNodeRecord creates a record for a node in the cluster
func (cm *ClusterManager) createClusterNodeRecord(ctx context.Context, clusterID, nodeID string, role NodeRole, ports *PortBlock, pid int) {
recordID := uuid.New().String()
now := time.Now()
insertQuery := `
INSERT INTO namespace_cluster_nodes (
id, namespace_cluster_id, node_id, role,
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
status, process_pid, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`
_, _ = cm.db.Exec(ctx, insertQuery,
recordID,
clusterID,
nodeID,
string(role),
ports.RQLiteHTTPPort,
ports.RQLiteRaftPort,
ports.OlricHTTPPort,
ports.OlricMemberlistPort,
ports.GatewayHTTPPort,
string(NodeStatusRunning),
pid,
now,
now,
)
}
// updateClusterNodeOlricStatus updates a node record to indicate Olric is running
func (cm *ClusterManager) updateClusterNodeOlricStatus(ctx context.Context, clusterID, nodeID string) {
// Check if Olric role record exists
var existing []ClusterNode
checkQuery := `SELECT id FROM namespace_cluster_nodes WHERE namespace_cluster_id = ? AND node_id = ? AND role = ?`
_ = cm.db.Query(ctx, &existing, checkQuery, clusterID, nodeID, string(NodeRoleOlric))
if len(existing) == 0 {
// Create new record for Olric role
recordID := uuid.New().String()
now := time.Now()
insertQuery := `
INSERT INTO namespace_cluster_nodes (
id, namespace_cluster_id, node_id, role, status, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?)
`
_, _ = cm.db.Exec(ctx, insertQuery, recordID, clusterID, nodeID, string(NodeRoleOlric), string(NodeStatusRunning), now, now)
}
}
// updateClusterNodeGatewayStatus updates a node record to indicate Gateway is running
func (cm *ClusterManager) updateClusterNodeGatewayStatus(ctx context.Context, clusterID, nodeID string) {
// Check if Gateway role record exists
var existing []ClusterNode
checkQuery := `SELECT id FROM namespace_cluster_nodes WHERE namespace_cluster_id = ? AND node_id = ? AND role = ?`
_ = cm.db.Query(ctx, &existing, checkQuery, clusterID, nodeID, string(NodeRoleGateway))
if len(existing) == 0 {
// Create new record for Gateway role
recordID := uuid.New().String()
now := time.Now()
insertQuery := `
INSERT INTO namespace_cluster_nodes (
id, namespace_cluster_id, node_id, role, status, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?)
`
_, _ = cm.db.Exec(ctx, insertQuery, recordID, clusterID, nodeID, string(NodeRoleGateway), string(NodeStatusRunning), now, now)
}
}
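// exampleProvisionAndWait is an illustrative sketch only (not wired into any
// handler): ProvisionCluster returns immediately, and callers poll
// GetClusterStatus until the cluster is ready or has failed. The namespace ID,
// name, owner and poll interval below are placeholder values.
func exampleProvisionAndWait(ctx context.Context, cm *ClusterManager) (*ClusterProvisioningStatus, error) {
	cluster, err := cm.ProvisionCluster(ctx, 42, "alice", "alice-api-key")
	if err != nil {
		return nil, err
	}
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-ticker.C:
			status, err := cm.GetClusterStatus(ctx, cluster.ID)
			if err != nil {
				return nil, err
			}
			switch status.Status {
			case ClusterStatusReady:
				return status, nil
			case ClusterStatusFailed:
				return status, fmt.Errorf("provisioning failed: %s", status.Error)
			}
			// Still provisioning: keep polling.
		}
	}
}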

View File

@ -0,0 +1,395 @@
package namespace
import (
"testing"
"time"
"go.uber.org/zap"
)
func TestClusterManagerConfig(t *testing.T) {
cfg := ClusterManagerConfig{
BaseDomain: "devnet-orama.network",
BaseDataDir: "~/.orama/data/namespaces",
}
if cfg.BaseDomain != "devnet-orama.network" {
t.Errorf("BaseDomain = %s, want devnet-orama.network", cfg.BaseDomain)
}
if cfg.BaseDataDir != "~/.orama/data/namespaces" {
t.Errorf("BaseDataDir = %s, want ~/.orama/data/namespaces", cfg.BaseDataDir)
}
}
func TestNewClusterManager(t *testing.T) {
mockDB := newMockRQLiteClient()
logger := zap.NewNop()
cfg := ClusterManagerConfig{
BaseDomain: "devnet-orama.network",
BaseDataDir: "/tmp/test-namespaces",
}
manager := NewClusterManager(mockDB, cfg, logger)
if manager == nil {
t.Fatal("NewClusterManager returned nil")
}
}
func TestNamespaceCluster_InitialState(t *testing.T) {
now := time.Now()
cluster := &NamespaceCluster{
ID: "test-cluster-id",
NamespaceID: 1,
NamespaceName: "test-namespace",
Status: ClusterStatusProvisioning,
RQLiteNodeCount: DefaultRQLiteNodeCount,
OlricNodeCount: DefaultOlricNodeCount,
GatewayNodeCount: DefaultGatewayNodeCount,
ProvisionedBy: "test-user",
ProvisionedAt: now,
ReadyAt: nil,
ErrorMessage: "",
RetryCount: 0,
}
// Verify initial state
if cluster.Status != ClusterStatusProvisioning {
t.Errorf("Initial status = %s, want %s", cluster.Status, ClusterStatusProvisioning)
}
if cluster.ReadyAt != nil {
t.Error("ReadyAt should be nil initially")
}
if cluster.ErrorMessage != "" {
t.Errorf("ErrorMessage should be empty initially, got %s", cluster.ErrorMessage)
}
if cluster.RetryCount != 0 {
t.Errorf("RetryCount should be 0 initially, got %d", cluster.RetryCount)
}
}
func TestNamespaceCluster_DefaultNodeCounts(t *testing.T) {
cluster := &NamespaceCluster{
RQLiteNodeCount: DefaultRQLiteNodeCount,
OlricNodeCount: DefaultOlricNodeCount,
GatewayNodeCount: DefaultGatewayNodeCount,
}
if cluster.RQLiteNodeCount != 3 {
t.Errorf("RQLiteNodeCount = %d, want 3", cluster.RQLiteNodeCount)
}
if cluster.OlricNodeCount != 3 {
t.Errorf("OlricNodeCount = %d, want 3", cluster.OlricNodeCount)
}
if cluster.GatewayNodeCount != 3 {
t.Errorf("GatewayNodeCount = %d, want 3", cluster.GatewayNodeCount)
}
}
func TestClusterProvisioningStatus_ReadinessFlags(t *testing.T) {
tests := []struct {
name string
rqliteReady bool
olricReady bool
gatewayReady bool
dnsReady bool
expectedAll bool
}{
{"All ready", true, true, true, true, true},
{"RQLite not ready", false, true, true, true, false},
{"Olric not ready", true, false, true, true, false},
{"Gateway not ready", true, true, false, true, false},
{"DNS not ready", true, true, true, false, false},
{"None ready", false, false, false, false, false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
status := &ClusterProvisioningStatus{
RQLiteReady: tt.rqliteReady,
OlricReady: tt.olricReady,
GatewayReady: tt.gatewayReady,
DNSReady: tt.dnsReady,
}
allReady := status.RQLiteReady && status.OlricReady && status.GatewayReady && status.DNSReady
if allReady != tt.expectedAll {
t.Errorf("All ready = %v, want %v", allReady, tt.expectedAll)
}
})
}
}
func TestClusterStatusTransitions(t *testing.T) {
// Test valid status transitions
validTransitions := map[ClusterStatus][]ClusterStatus{
ClusterStatusNone: {ClusterStatusProvisioning},
ClusterStatusProvisioning: {ClusterStatusReady, ClusterStatusFailed},
ClusterStatusReady: {ClusterStatusDegraded, ClusterStatusDeprovisioning},
ClusterStatusDegraded: {ClusterStatusReady, ClusterStatusFailed, ClusterStatusDeprovisioning},
ClusterStatusFailed: {ClusterStatusProvisioning, ClusterStatusDeprovisioning}, // Retry or delete
ClusterStatusDeprovisioning: {ClusterStatusNone},
}
for from, toList := range validTransitions {
for _, to := range toList {
t.Run(string(from)+"->"+string(to), func(t *testing.T) {
// This is a documentation test - it verifies the expected transitions
// The actual enforcement would be in the ClusterManager methods
if from == to && from != ClusterStatusNone {
t.Errorf("Status should not transition to itself: %s -> %s", from, to)
}
})
}
}
}
func TestClusterNode_RoleAssignment(t *testing.T) {
// Test that a node can have multiple roles
roles := []NodeRole{
NodeRoleRQLiteLeader,
NodeRoleRQLiteFollower,
NodeRoleOlric,
NodeRoleGateway,
}
// In the implementation, each node hosts all three services
// but we track them as separate role records
expectedRolesPerNode := 3 // RQLite (leader OR follower), Olric, Gateway
// For a 3-node cluster
nodesCount := 3
totalRoleRecords := nodesCount * expectedRolesPerNode
if totalRoleRecords != 9 {
t.Errorf("Expected 9 role records for 3 nodes, got %d", totalRoleRecords)
}
// Verify all roles are represented
if len(roles) != 4 {
t.Errorf("Expected 4 role types, got %d", len(roles))
}
}
func TestClusterEvent_LifecycleEvents(t *testing.T) {
// Test all lifecycle events are properly ordered
lifecycleOrder := []EventType{
EventProvisioningStarted,
EventNodesSelected,
EventPortsAllocated,
EventRQLiteStarted,
EventRQLiteJoined,
EventRQLiteLeaderElected,
EventOlricStarted,
EventOlricJoined,
EventGatewayStarted,
EventDNSCreated,
EventClusterReady,
}
// Verify we have all the events
if len(lifecycleOrder) != 11 {
t.Errorf("Expected 11 lifecycle events, got %d", len(lifecycleOrder))
}
// Verify they're all unique
seen := make(map[EventType]bool)
for _, event := range lifecycleOrder {
if seen[event] {
t.Errorf("Duplicate event type: %s", event)
}
seen[event] = true
}
}
func TestClusterEvent_FailureEvents(t *testing.T) {
failureEvents := []EventType{
EventClusterDegraded,
EventClusterFailed,
EventNodeFailed,
}
for _, event := range failureEvents {
t.Run(string(event), func(t *testing.T) {
if event == "" {
t.Error("Event type should not be empty")
}
})
}
}
func TestClusterEvent_RecoveryEvents(t *testing.T) {
recoveryEvents := []EventType{
EventNodeRecovered,
}
for _, event := range recoveryEvents {
t.Run(string(event), func(t *testing.T) {
if event == "" {
t.Error("Event type should not be empty")
}
})
}
}
func TestClusterEvent_DeprovisioningEvents(t *testing.T) {
deprovisionEvents := []EventType{
EventDeprovisionStarted,
EventDeprovisioned,
}
for _, event := range deprovisionEvents {
t.Run(string(event), func(t *testing.T) {
if event == "" {
t.Error("Event type should not be empty")
}
})
}
}
func TestProvisioningResponse_PollURL(t *testing.T) {
clusterID := "test-cluster-123"
expectedPollURL := "/v1/namespace/status?id=test-cluster-123"
pollURL := "/v1/namespace/status?id=" + clusterID
if pollURL != expectedPollURL {
t.Errorf("PollURL = %s, want %s", pollURL, expectedPollURL)
}
}
func TestClusterManager_PortAllocationOrder(t *testing.T) {
// Verify the order of port assignments within a block
portStart := 10000
rqliteHTTP := portStart + 0
rqliteRaft := portStart + 1
olricHTTP := portStart + 2
olricMemberlist := portStart + 3
gatewayHTTP := portStart + 4
// Verify order
if rqliteHTTP != 10000 {
t.Errorf("RQLite HTTP port = %d, want 10000", rqliteHTTP)
}
if rqliteRaft != 10001 {
t.Errorf("RQLite Raft port = %d, want 10001", rqliteRaft)
}
if olricHTTP != 10002 {
t.Errorf("Olric HTTP port = %d, want 10002", olricHTTP)
}
if olricMemberlist != 10003 {
t.Errorf("Olric Memberlist port = %d, want 10003", olricMemberlist)
}
if gatewayHTTP != 10004 {
t.Errorf("Gateway HTTP port = %d, want 10004", gatewayHTTP)
}
}
func TestClusterManager_DNSFormat(t *testing.T) {
// Test the DNS domain format for namespace gateways
baseDomain := "devnet-orama.network"
namespaceName := "alice"
expectedDomain := "ns-alice.devnet-orama.network"
actualDomain := "ns-" + namespaceName + "." + baseDomain
if actualDomain != expectedDomain {
t.Errorf("DNS domain = %s, want %s", actualDomain, expectedDomain)
}
}
func TestClusterManager_RQLiteAddresses(t *testing.T) {
// Test RQLite advertised address format
nodeIP := "192.168.1.100"
expectedHTTPAddr := "192.168.1.100:10000"
expectedRaftAddr := "192.168.1.100:10001"
httpAddr := nodeIP + ":10000"
raftAddr := nodeIP + ":10001"
if httpAddr != expectedHTTPAddr {
t.Errorf("HTTP address = %s, want %s", httpAddr, expectedHTTPAddr)
}
if raftAddr != expectedRaftAddr {
t.Errorf("Raft address = %s, want %s", raftAddr, expectedRaftAddr)
}
}
func TestClusterManager_OlricPeerFormat(t *testing.T) {
// Test Olric peer address format
nodes := []struct {
ip string
port int
}{
{"192.168.1.100", 10003},
{"192.168.1.101", 10003},
{"192.168.1.102", 10003},
}
peers := make([]string, len(nodes))
for i, n := range nodes {
peers[i] = n.ip + ":10003"
}
expected := []string{
"192.168.1.100:10003",
"192.168.1.101:10003",
"192.168.1.102:10003",
}
for i, peer := range peers {
if peer != expected[i] {
t.Errorf("Peer[%d] = %s, want %s", i, peer, expected[i])
}
}
}
func TestClusterManager_GatewayRQLiteDSN(t *testing.T) {
// Test the RQLite DSN format used by gateways
nodeIP := "192.168.1.100"
expectedDSN := "http://192.168.1.100:10000"
actualDSN := "http://" + nodeIP + ":10000"
if actualDSN != expectedDSN {
t.Errorf("RQLite DSN = %s, want %s", actualDSN, expectedDSN)
}
}
func TestClusterManager_MinimumNodeRequirement(t *testing.T) {
// A cluster requires at least 3 nodes
minimumNodes := DefaultRQLiteNodeCount
if minimumNodes < 3 {
t.Errorf("Minimum node count = %d, want at least 3 for fault tolerance", minimumNodes)
}
}
func TestClusterManager_QuorumCalculation(t *testing.T) {
// For RQLite Raft consensus, quorum = (n/2) + 1
tests := []struct {
nodes int
expectedQuorum int
canLoseNodes int
}{
{3, 2, 1}, // 3 nodes: quorum=2, can lose 1
{5, 3, 2}, // 5 nodes: quorum=3, can lose 2
{7, 4, 3}, // 7 nodes: quorum=4, can lose 3
}
for _, tt := range tests {
t.Run(string(rune(tt.nodes+'0'))+" nodes", func(t *testing.T) {
quorum := (tt.nodes / 2) + 1
if quorum != tt.expectedQuorum {
t.Errorf("Quorum for %d nodes = %d, want %d", tt.nodes, quorum, tt.expectedQuorum)
}
canLose := tt.nodes - quorum
if canLose != tt.canLoseNodes {
t.Errorf("Can lose %d nodes, want %d", canLose, tt.canLoseNodes)
}
})
}
}

View File

@ -0,0 +1,251 @@
package namespace
import (
"context"
"fmt"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"github.com/google/uuid"
"go.uber.org/zap"
)
// DNSRecordManager manages DNS records for namespace clusters.
// It creates and deletes DNS A records for namespace gateway endpoints.
type DNSRecordManager struct {
db rqlite.Client
baseDomain string
logger *zap.Logger
}
// NewDNSRecordManager creates a new DNS record manager
func NewDNSRecordManager(db rqlite.Client, baseDomain string, logger *zap.Logger) *DNSRecordManager {
return &DNSRecordManager{
db: db,
baseDomain: baseDomain,
logger: logger.With(zap.String("component", "dns-record-manager")),
}
}
// CreateNamespaceRecords creates DNS A records for a namespace cluster.
// Each namespace gets records for ns-{namespace}.{baseDomain} pointing to its gateway nodes.
// Multiple A records enable round-robin DNS load balancing.
func (drm *DNSRecordManager) CreateNamespaceRecords(ctx context.Context, namespaceName string, nodeIPs []string) error {
internalCtx := client.WithInternalAuth(ctx)
if len(nodeIPs) == 0 {
return &ClusterError{Message: "no node IPs provided for DNS records"}
}
// FQDN for namespace gateway: ns-{namespace}.{baseDomain}.
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
drm.logger.Info("Creating namespace DNS records",
zap.String("namespace", namespaceName),
zap.String("fqdn", fqdn),
zap.Strings("node_ips", nodeIPs),
)
// First, delete any existing records for this namespace
deleteQuery := `DELETE FROM dns_records WHERE fqdn = ? AND namespace = ?`
_, err := drm.db.Exec(internalCtx, deleteQuery, fqdn, "namespace:"+namespaceName)
if err != nil {
drm.logger.Warn("Failed to delete existing DNS records", zap.Error(err))
// Continue anyway - the insert will just add more records
}
// Create A records for each node IP
for _, ip := range nodeIPs {
recordID := uuid.New().String()
insertQuery := `
INSERT INTO dns_records (
id, fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`
now := time.Now()
_, err := drm.db.Exec(internalCtx, insertQuery,
recordID,
fqdn,
"A",
ip,
60, // 60 second TTL for quick failover
"namespace:"+namespaceName, // Track ownership with namespace prefix
"cluster-manager", // Created by the cluster manager
true, // Active
now,
now,
)
if err != nil {
return &ClusterError{
Message: fmt.Sprintf("failed to create DNS record for %s -> %s", fqdn, ip),
Cause: err,
}
}
}
// Also create wildcard records for deployments under this namespace
// *.ns-{namespace}.{baseDomain} -> same IPs
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
// Delete existing wildcard records
_, _ = drm.db.Exec(internalCtx, deleteQuery, wildcardFqdn, "namespace:"+namespaceName)
for _, ip := range nodeIPs {
recordID := uuid.New().String()
insertQuery := `
INSERT INTO dns_records (
id, fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`
now := time.Now()
_, err := drm.db.Exec(internalCtx, insertQuery,
recordID,
wildcardFqdn,
"A",
ip,
60,
"namespace:"+namespaceName,
"cluster-manager",
true,
now,
now,
)
if err != nil {
drm.logger.Warn("Failed to create wildcard DNS record",
zap.String("fqdn", wildcardFqdn),
zap.String("ip", ip),
zap.Error(err),
)
// Continue - wildcard is nice to have but not critical
}
}
drm.logger.Info("Namespace DNS records created",
zap.String("namespace", namespaceName),
zap.Int("record_count", len(nodeIPs)*2), // A + wildcard
)
return nil
}
// DeleteNamespaceRecords deletes all DNS records for a namespace
func (drm *DNSRecordManager) DeleteNamespaceRecords(ctx context.Context, namespaceName string) error {
internalCtx := client.WithInternalAuth(ctx)
drm.logger.Info("Deleting namespace DNS records",
zap.String("namespace", namespaceName),
)
// Delete all records owned by this namespace
deleteQuery := `DELETE FROM dns_records WHERE namespace = ?`
_, err := drm.db.Exec(internalCtx, deleteQuery, "namespace:"+namespaceName)
if err != nil {
return &ClusterError{
Message: "failed to delete namespace DNS records",
Cause: err,
}
}
drm.logger.Info("Namespace DNS records deleted",
zap.String("namespace", namespaceName),
)
return nil
}
// GetNamespaceGatewayIPs returns the IP addresses for a namespace's gateway
func (drm *DNSRecordManager) GetNamespaceGatewayIPs(ctx context.Context, namespaceName string) ([]string, error) {
internalCtx := client.WithInternalAuth(ctx)
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
type recordRow struct {
Value string `db:"value"`
}
var records []recordRow
query := `SELECT value FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND is_active = TRUE`
err := drm.db.Query(internalCtx, &records, query, fqdn)
if err != nil {
return nil, &ClusterError{
Message: "failed to query namespace DNS records",
Cause: err,
}
}
ips := make([]string, len(records))
for i, r := range records {
ips[i] = r.Value
}
return ips, nil
}
// UpdateNamespaceRecord updates a specific node's DNS record (for failover)
func (drm *DNSRecordManager) UpdateNamespaceRecord(ctx context.Context, namespaceName, oldIP, newIP string) error {
internalCtx := client.WithInternalAuth(ctx)
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
drm.logger.Info("Updating namespace DNS record",
zap.String("namespace", namespaceName),
zap.String("old_ip", oldIP),
zap.String("new_ip", newIP),
)
// Update both the main record and wildcard record
for _, f := range []string{fqdn, wildcardFqdn} {
updateQuery := `UPDATE dns_records SET value = ?, updated_at = ? WHERE fqdn = ? AND value = ?`
_, err := drm.db.Exec(internalCtx, updateQuery, newIP, time.Now(), f, oldIP)
if err != nil {
drm.logger.Warn("Failed to update DNS record",
zap.String("fqdn", f),
zap.Error(err),
)
}
}
return nil
}
// DisableNamespaceRecord marks a specific IP's record as inactive (for temporary failover)
func (drm *DNSRecordManager) DisableNamespaceRecord(ctx context.Context, namespaceName, ip string) error {
internalCtx := client.WithInternalAuth(ctx)
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
drm.logger.Info("Disabling namespace DNS record",
zap.String("namespace", namespaceName),
zap.String("ip", ip),
)
for _, f := range []string{fqdn, wildcardFqdn} {
updateQuery := `UPDATE dns_records SET is_active = FALSE, updated_at = ? WHERE fqdn = ? AND value = ?`
_, _ = drm.db.Exec(internalCtx, updateQuery, time.Now(), f, ip)
}
return nil
}
// EnableNamespaceRecord marks a specific IP's record as active (for recovery)
func (drm *DNSRecordManager) EnableNamespaceRecord(ctx context.Context, namespaceName, ip string) error {
internalCtx := client.WithInternalAuth(ctx)
fqdn := fmt.Sprintf("ns-%s.%s.", namespaceName, drm.baseDomain)
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", namespaceName, drm.baseDomain)
drm.logger.Info("Enabling namespace DNS record",
zap.String("namespace", namespaceName),
zap.String("ip", ip),
)
for _, f := range []string{fqdn, wildcardFqdn} {
updateQuery := `UPDATE dns_records SET is_active = TRUE, updated_at = ? WHERE fqdn = ? AND value = ?`
_, _ = drm.db.Exec(internalCtx, updateQuery, time.Now(), f, ip)
}
return nil
}
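// exampleNamespaceDNSLifecycle is an illustrative sketch only, showing how the
// record manager could be driven during provisioning and a later failover. The
// namespace name and IP addresses are placeholders.
func exampleNamespaceDNSLifecycle(ctx context.Context, drm *DNSRecordManager) error {
	// Provisioning: one A record per gateway node enables round-robin DNS.
	if err := drm.CreateNamespaceRecords(ctx, "alice", []string{"10.0.0.1", "10.0.0.2", "10.0.0.3"}); err != nil {
		return err
	}
	// Transient failure: take the unhealthy node out of rotation...
	if err := drm.DisableNamespaceRecord(ctx, "alice", "10.0.0.2"); err != nil {
		return err
	}
	// ...and put it back once it recovers.
	if err := drm.EnableNamespaceRecord(ctx, "alice", "10.0.0.2"); err != nil {
		return err
	}
	// Permanent replacement: repoint the record at the new node's IP.
	return drm.UpdateNamespaceRecord(ctx, "alice", "10.0.0.2", "10.0.0.4")
}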

View File

@ -0,0 +1,217 @@
package namespace
import (
"fmt"
"testing"
"go.uber.org/zap"
)
func TestDNSRecordManager_FQDNFormat(t *testing.T) {
// Test that FQDN is correctly formatted
tests := []struct {
namespace string
baseDomain string
expected string
}{
{"alice", "devnet-orama.network", "ns-alice.devnet-orama.network."},
{"bob", "testnet-orama.network", "ns-bob.testnet-orama.network."},
{"my-namespace", "mainnet-orama.network", "ns-my-namespace.mainnet-orama.network."},
{"test123", "example.com", "ns-test123.example.com."},
}
for _, tt := range tests {
t.Run(tt.namespace, func(t *testing.T) {
fqdn := fmt.Sprintf("ns-%s.%s.", tt.namespace, tt.baseDomain)
if fqdn != tt.expected {
t.Errorf("FQDN = %s, want %s", fqdn, tt.expected)
}
})
}
}
func TestDNSRecordManager_WildcardFQDNFormat(t *testing.T) {
// Test that wildcard FQDN is correctly formatted
tests := []struct {
namespace string
baseDomain string
expected string
}{
{"alice", "devnet-orama.network", "*.ns-alice.devnet-orama.network."},
{"bob", "testnet-orama.network", "*.ns-bob.testnet-orama.network."},
}
for _, tt := range tests {
t.Run(tt.namespace, func(t *testing.T) {
wildcardFqdn := fmt.Sprintf("*.ns-%s.%s.", tt.namespace, tt.baseDomain)
if wildcardFqdn != tt.expected {
t.Errorf("Wildcard FQDN = %s, want %s", wildcardFqdn, tt.expected)
}
})
}
}
func TestNewDNSRecordManager(t *testing.T) {
mockDB := newMockRQLiteClient()
logger := zap.NewNop()
baseDomain := "devnet-orama.network"
manager := NewDNSRecordManager(mockDB, baseDomain, logger)
if manager == nil {
t.Fatal("NewDNSRecordManager returned nil")
}
}
func TestDNSRecordManager_NamespacePrefix(t *testing.T) {
// Test the namespace prefix used for tracking ownership
namespace := "my-namespace"
expected := "namespace:my-namespace"
prefix := "namespace:" + namespace
if prefix != expected {
t.Errorf("Namespace prefix = %s, want %s", prefix, expected)
}
}
func TestDNSRecordTTL(t *testing.T) {
// DNS records should have a 60-second TTL for quick failover
expectedTTL := 60
// This is testing the constant used in the code
ttl := 60
if ttl != expectedTTL {
t.Errorf("TTL = %d, want %d", ttl, expectedTTL)
}
}
func TestDNSRecordManager_MultipleDomainFormats(t *testing.T) {
// Test support for different domain formats
baseDomains := []string{
"devnet-orama.network",
"testnet-orama.network",
"mainnet-orama.network",
"custom.example.com",
"subdomain.custom.example.com",
}
for _, baseDomain := range baseDomains {
t.Run(baseDomain, func(t *testing.T) {
namespace := "test"
fqdn := fmt.Sprintf("ns-%s.%s.", namespace, baseDomain)
// Verify FQDN ends with trailing dot
if fqdn[len(fqdn)-1] != '.' {
t.Errorf("FQDN should end with trailing dot: %s", fqdn)
}
// Verify format is correct
expectedPrefix := "ns-test."
if len(fqdn) <= len(expectedPrefix) {
t.Errorf("FQDN too short: %s", fqdn)
}
if fqdn[:len(expectedPrefix)] != expectedPrefix {
t.Errorf("FQDN should start with %s: %s", expectedPrefix, fqdn)
}
})
}
}
func TestDNSRecordManager_IPValidation(t *testing.T) {
// Test IP address formats that should be accepted
validIPs := []string{
"192.168.1.1",
"10.0.0.1",
"172.16.0.1",
"1.2.3.4",
"255.255.255.255",
}
for _, ip := range validIPs {
t.Run(ip, func(t *testing.T) {
// Basic validation: IP should not be empty
if ip == "" {
t.Error("IP should not be empty")
}
})
}
}
func TestDNSRecordManager_EmptyNodeIPs(t *testing.T) {
// Creating records with empty node IPs should be an error
nodeIPs := []string{}
if len(nodeIPs) == 0 {
// This condition should trigger the error in CreateNamespaceRecords
err := &ClusterError{Message: "no node IPs provided for DNS records"}
if err.Message != "no node IPs provided for DNS records" {
t.Error("Expected error message for empty IPs")
}
}
}
func TestDNSRecordManager_RecordTypes(t *testing.T) {
// DNS records for namespace gateways should be A records
expectedRecordType := "A"
recordType := "A"
if recordType != expectedRecordType {
t.Errorf("Record type = %s, want %s", recordType, expectedRecordType)
}
}
func TestDNSRecordManager_CreatedByField(t *testing.T) {
// Records should be created by "cluster-manager"
expected := "cluster-manager"
createdBy := "cluster-manager"
if createdBy != expected {
t.Errorf("CreatedBy = %s, want %s", createdBy, expected)
}
}
func TestDNSRecordManager_RoundRobinConcept(t *testing.T) {
// Test that multiple A records for the same FQDN enable round-robin
nodeIPs := []string{
"192.168.1.100",
"192.168.1.101",
"192.168.1.102",
}
// For round-robin DNS, we need one A record per IP
expectedRecordCount := len(nodeIPs)
if expectedRecordCount != 3 {
t.Errorf("Expected %d A records for round-robin, got %d", 3, expectedRecordCount)
}
// Each IP should be unique
seen := make(map[string]bool)
for _, ip := range nodeIPs {
if seen[ip] {
t.Errorf("Duplicate IP in node list: %s", ip)
}
seen[ip] = true
}
}
func TestDNSRecordManager_FQDNWithTrailingDot(t *testing.T) {
// DNS FQDNs should always end with a trailing dot
// This is important for proper DNS resolution
tests := []struct {
input string
expected string
}{
{"ns-alice.devnet-orama.network", "ns-alice.devnet-orama.network."},
}
for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
fqdn := tt.input + "."
if fqdn != tt.expected {
t.Errorf("FQDN = %s, want %s", fqdn, tt.expected)
}
})
}
}

View File

@ -0,0 +1,382 @@
package namespace
import (
"context"
"sort"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
// ClusterNodeSelector selects optimal nodes for namespace clusters.
// It extends the existing capacity scoring system from deployments/home_node.go
// to select multiple nodes based on available capacity.
type ClusterNodeSelector struct {
db rqlite.Client
portAllocator *NamespacePortAllocator
logger *zap.Logger
}
// NodeCapacity represents the capacity metrics for a single node
type NodeCapacity struct {
NodeID string `json:"node_id"`
IPAddress string `json:"ip_address"`
DeploymentCount int `json:"deployment_count"`
AllocatedPorts int `json:"allocated_ports"`
AvailablePorts int `json:"available_ports"`
UsedMemoryMB int `json:"used_memory_mb"`
AvailableMemoryMB int `json:"available_memory_mb"`
UsedCPUPercent int `json:"used_cpu_percent"`
NamespaceInstanceCount int `json:"namespace_instance_count"` // Number of namespace clusters on this node
AvailableNamespaceSlots int `json:"available_namespace_slots"` // How many more namespace instances can fit
Score float64 `json:"score"`
}
// NewClusterNodeSelector creates a new node selector
func NewClusterNodeSelector(db rqlite.Client, portAllocator *NamespacePortAllocator, logger *zap.Logger) *ClusterNodeSelector {
return &ClusterNodeSelector{
db: db,
portAllocator: portAllocator,
logger: logger.With(zap.String("component", "cluster-node-selector")),
}
}
// SelectNodesForCluster selects the optimal N nodes for a new namespace cluster.
// Returns the node IDs sorted by score (best first).
func (cns *ClusterNodeSelector) SelectNodesForCluster(ctx context.Context, nodeCount int) ([]NodeCapacity, error) {
internalCtx := client.WithInternalAuth(ctx)
// Get all active nodes
activeNodes, err := cns.getActiveNodes(internalCtx)
if err != nil {
return nil, err
}
cns.logger.Debug("Found active nodes", zap.Int("count", len(activeNodes)))
// Filter nodes that have capacity for namespace instances
eligibleNodes := make([]NodeCapacity, 0)
for _, node := range activeNodes {
capacity, err := cns.getNodeCapacity(internalCtx, node.NodeID, node.IPAddress)
if err != nil {
cns.logger.Warn("Failed to get node capacity, skipping",
zap.String("node_id", node.NodeID),
zap.Error(err),
)
continue
}
// Only include nodes with available namespace slots
if capacity.AvailableNamespaceSlots > 0 {
eligibleNodes = append(eligibleNodes, *capacity)
} else {
cns.logger.Debug("Node at capacity, skipping",
zap.String("node_id", node.NodeID),
zap.Int("namespace_instances", capacity.NamespaceInstanceCount),
)
}
}
cns.logger.Debug("Eligible nodes after filtering", zap.Int("count", len(eligibleNodes)))
// Check if we have enough eligible nodes; return the sentinel so callers can match it with errors.Is
if len(eligibleNodes) < nodeCount {
return nil, ErrInsufficientNodes
}
// Sort by score (highest first)
sort.Slice(eligibleNodes, func(i, j int) bool {
return eligibleNodes[i].Score > eligibleNodes[j].Score
})
// Return top N nodes
selectedNodes := eligibleNodes[:nodeCount]
cns.logger.Info("Selected nodes for cluster",
zap.Int("requested", nodeCount),
zap.Int("selected", len(selectedNodes)),
)
for i, node := range selectedNodes {
cns.logger.Debug("Selected node",
zap.Int("rank", i+1),
zap.String("node_id", node.NodeID),
zap.Float64("score", node.Score),
zap.Int("namespace_instances", node.NamespaceInstanceCount),
zap.Int("available_slots", node.AvailableNamespaceSlots),
)
}
return selectedNodes, nil
}
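// Illustrative sketch (not part of the commit): how a provisioning workflow might
// combine node selection with port allocation. The clusterID parameter and the
// DefaultRQLiteNodeCount policy are assumptions for the example; the method calls
// themselves are defined in this package.
func exampleSelectAndAllocate(ctx context.Context, cns *ClusterNodeSelector, clusterID string) ([]NodeCapacity, []*PortBlock, error) {
nodes, err := cns.SelectNodesForCluster(ctx, DefaultRQLiteNodeCount)
if err != nil {
return nil, nil, err
}
blocks := make([]*PortBlock, 0, len(nodes))
for _, n := range nodes {
// Each selected node receives one 5-port block for this namespace cluster
block, err := cns.portAllocator.AllocatePortBlock(ctx, n.NodeID, clusterID)
if err != nil {
return nil, nil, err
}
blocks = append(blocks, block)
}
return nodes, blocks, nil
}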
// nodeInfo is used for querying active nodes
type nodeInfo struct {
NodeID string `db:"id"`
IPAddress string `db:"ip_address"`
}
// getActiveNodes retrieves all active nodes from dns_nodes table
func (cns *ClusterNodeSelector) getActiveNodes(ctx context.Context) ([]nodeInfo, error) {
// Nodes must have checked in within last 2 minutes
cutoff := time.Now().Add(-2 * time.Minute)
var results []nodeInfo
query := `
SELECT id, ip_address FROM dns_nodes
WHERE status = 'active' AND last_seen > ?
ORDER BY id
`
err := cns.db.Query(ctx, &results, query, cutoff.Format("2006-01-02 15:04:05"))
if err != nil {
return nil, &ClusterError{
Message: "failed to query active nodes",
Cause: err,
}
}
cns.logger.Debug("Found active nodes",
zap.Int("count", len(results)),
)
return results, nil
}
// getNodeCapacity calculates capacity metrics for a single node
func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipAddress string) (*NodeCapacity, error) {
// Get deployment count
deploymentCount, err := cns.getDeploymentCount(ctx, nodeID)
if err != nil {
return nil, err
}
// Get allocated deployment ports
allocatedPorts, err := cns.getDeploymentPortCount(ctx, nodeID)
if err != nil {
return nil, err
}
// Get resource usage from home_node_assignments
totalMemoryMB, totalCPUPercent, err := cns.getNodeResourceUsage(ctx, nodeID)
if err != nil {
return nil, err
}
// Get namespace instance count
namespaceInstanceCount, err := cns.portAllocator.GetNodeAllocationCount(ctx, nodeID)
if err != nil {
return nil, err
}
// Calculate available capacity
const (
maxDeployments = 100
maxPorts = 9900 // User deployment port range
maxMemoryMB = 8192 // 8GB
maxCPUPercent = 400 // 4 cores
)
availablePorts := maxPorts - allocatedPorts
if availablePorts < 0 {
availablePorts = 0
}
availableMemoryMB := maxMemoryMB - totalMemoryMB
if availableMemoryMB < 0 {
availableMemoryMB = 0
}
availableNamespaceSlots := MaxNamespacesPerNode - namespaceInstanceCount
if availableNamespaceSlots < 0 {
availableNamespaceSlots = 0
}
// Calculate capacity score (0.0 to 1.0, higher is better)
// Extended from home_node.go to include namespace instance count
score := cns.calculateCapacityScore(
deploymentCount, maxDeployments,
allocatedPorts, maxPorts,
totalMemoryMB, maxMemoryMB,
totalCPUPercent, maxCPUPercent,
namespaceInstanceCount, MaxNamespacesPerNode,
)
capacity := &NodeCapacity{
NodeID: nodeID,
IPAddress: ipAddress,
DeploymentCount: deploymentCount,
AllocatedPorts: allocatedPorts,
AvailablePorts: availablePorts,
UsedMemoryMB: totalMemoryMB,
AvailableMemoryMB: availableMemoryMB,
UsedCPUPercent: totalCPUPercent,
NamespaceInstanceCount: namespaceInstanceCount,
AvailableNamespaceSlots: availableNamespaceSlots,
Score: score,
}
return capacity, nil
}
// getDeploymentCount counts active deployments on a node
func (cns *ClusterNodeSelector) getDeploymentCount(ctx context.Context, nodeID string) (int, error) {
type countResult struct {
Count int `db:"count"`
}
var results []countResult
query := `SELECT COUNT(*) as count FROM deployments WHERE home_node_id = ? AND status IN ('active', 'deploying')`
err := cns.db.Query(ctx, &results, query, nodeID)
if err != nil {
return 0, &ClusterError{
Message: "failed to count deployments",
Cause: err,
}
}
if len(results) == 0 {
return 0, nil
}
return results[0].Count, nil
}
// getDeploymentPortCount counts allocated deployment ports on a node
func (cns *ClusterNodeSelector) getDeploymentPortCount(ctx context.Context, nodeID string) (int, error) {
type countResult struct {
Count int `db:"count"`
}
var results []countResult
query := `SELECT COUNT(*) as count FROM port_allocations WHERE node_id = ?`
err := cns.db.Query(ctx, &results, query, nodeID)
if err != nil {
return 0, &ClusterError{
Message: "failed to count allocated ports",
Cause: err,
}
}
if len(results) == 0 {
return 0, nil
}
return results[0].Count, nil
}
// getNodeResourceUsage sums up resource usage for all namespaces on a node
func (cns *ClusterNodeSelector) getNodeResourceUsage(ctx context.Context, nodeID string) (int, int, error) {
type resourceResult struct {
TotalMemoryMB int `db:"total_memory"`
TotalCPUPercent int `db:"total_cpu"`
}
var results []resourceResult
query := `
SELECT
COALESCE(SUM(total_memory_mb), 0) as total_memory,
COALESCE(SUM(total_cpu_percent), 0) as total_cpu
FROM home_node_assignments
WHERE home_node_id = ?
`
err := cns.db.Query(ctx, &results, query, nodeID)
if err != nil {
return 0, 0, &ClusterError{
Message: "failed to query resource usage",
Cause: err,
}
}
if len(results) == 0 {
return 0, 0, nil
}
return results[0].TotalMemoryMB, results[0].TotalCPUPercent, nil
}
// calculateCapacityScore calculates a weighted capacity score (0.0 to 1.0)
// Higher scores indicate more available capacity
func (cns *ClusterNodeSelector) calculateCapacityScore(
deploymentCount, maxDeployments int,
allocatedPorts, maxPorts int,
usedMemoryMB, maxMemoryMB int,
usedCPUPercent, maxCPUPercent int,
namespaceInstances, maxNamespaceInstances int,
) float64 {
// Calculate individual component scores (0.0 to 1.0)
deploymentScore := 1.0 - (float64(deploymentCount) / float64(maxDeployments))
if deploymentScore < 0 {
deploymentScore = 0
}
portScore := 1.0 - (float64(allocatedPorts) / float64(maxPorts))
if portScore < 0 {
portScore = 0
}
memoryScore := 1.0 - (float64(usedMemoryMB) / float64(maxMemoryMB))
if memoryScore < 0 {
memoryScore = 0
}
cpuScore := 1.0 - (float64(usedCPUPercent) / float64(maxCPUPercent))
if cpuScore < 0 {
cpuScore = 0
}
namespaceScore := 1.0 - (float64(namespaceInstances) / float64(maxNamespaceInstances))
if namespaceScore < 0 {
namespaceScore = 0
}
// Weighted average
// Namespace instance count gets significant weight since that's what we're optimizing for
// Weights: deployments 30%, ports 15%, memory 15%, cpu 15%, namespace instances 25%
totalScore := (deploymentScore * 0.30) +
(portScore * 0.15) +
(memoryScore * 0.15) +
(cpuScore * 0.15) +
(namespaceScore * 0.25)
cns.logger.Debug("Calculated capacity score",
zap.Int("deployments", deploymentCount),
zap.Int("allocated_ports", allocatedPorts),
zap.Int("used_memory_mb", usedMemoryMB),
zap.Int("used_cpu_percent", usedCPUPercent),
zap.Int("namespace_instances", namespaceInstances),
zap.Float64("deployment_score", deploymentScore),
zap.Float64("port_score", portScore),
zap.Float64("memory_score", memoryScore),
zap.Float64("cpu_score", cpuScore),
zap.Float64("namespace_score", namespaceScore),
zap.Float64("total_score", totalScore),
)
return totalScore
}
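// Worked example (illustrative numbers): a node with 10/100 deployments, 500/9900
// ports, 1024/8192 MB memory, 100/400% CPU and 2/20 namespace instances scores
// 0.90*0.30 + ~0.95*0.15 + 0.875*0.15 + 0.75*0.15 + 0.90*0.25 ≈ 0.88,
// so a mostly idle node ranks near the top when new clusters are placed.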
// GetNodeByID retrieves a node's information by ID
func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) {
internalCtx := client.WithInternalAuth(ctx)
var results []nodeInfo
query := `SELECT id, ip_address FROM dns_nodes WHERE id = ? LIMIT 1`
err := cns.db.Query(internalCtx, &results, query, nodeID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query node",
Cause: err,
}
}
if len(results) == 0 {
return nil, nil
}
return &results[0], nil
}

View File

@ -0,0 +1,227 @@
package namespace
import (
"testing"
"go.uber.org/zap"
)
func TestCalculateCapacityScore_EmptyNode(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Empty node should have score of 1.0 (100% available)
score := selector.calculateCapacityScore(
0, 100, // deployments
0, 9900, // ports
0, 8192, // memory
0, 400, // cpu
0, 20, // namespace instances
)
if score != 1.0 {
t.Errorf("Empty node score = %f, want 1.0", score)
}
}
func TestCalculateCapacityScore_FullNode(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Full node should have score of 0.0 (0% available)
score := selector.calculateCapacityScore(
100, 100, // deployments (full)
9900, 9900, // ports (full)
8192, 8192, // memory (full)
400, 400, // cpu (full)
20, 20, // namespace instances (full)
)
if score != 0.0 {
t.Errorf("Full node score = %f, want 0.0", score)
}
}
func TestCalculateCapacityScore_HalfCapacity(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Half-full node should have score of approximately 0.5
score := selector.calculateCapacityScore(
50, 100, // 50% deployments
4950, 9900, // 50% ports
4096, 8192, // 50% memory
200, 400, // 50% cpu
10, 20, // 50% namespace instances
)
// With all components at 50%, the weighted average should be 0.5
expected := 0.5
tolerance := 0.01
if score < expected-tolerance || score > expected+tolerance {
t.Errorf("Half capacity score = %f, want approximately %f", score, expected)
}
}
func TestCalculateCapacityScore_Weights(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Test that deployment weight is 30%, namespace instance weight is 25%
// Only deployments full (other metrics empty)
deploymentOnlyScore := selector.calculateCapacityScore(
100, 100, // deployments full (contributes 0 * 0.30 = 0)
0, 9900, // ports empty (contributes 1.0 * 0.15 = 0.15)
0, 8192, // memory empty (contributes 1.0 * 0.15 = 0.15)
0, 400, // cpu empty (contributes 1.0 * 0.15 = 0.15)
0, 20, // namespace instances empty (contributes 1.0 * 0.25 = 0.25)
)
// Expected: 0 + 0.15 + 0.15 + 0.15 + 0.25 = 0.70
expectedDeploymentOnly := 0.70
tolerance := 0.01
if deploymentOnlyScore < expectedDeploymentOnly-tolerance || deploymentOnlyScore > expectedDeploymentOnly+tolerance {
t.Errorf("Deployment-only-full score = %f, want %f", deploymentOnlyScore, expectedDeploymentOnly)
}
// Only namespace instances full (other metrics empty)
namespaceOnlyScore := selector.calculateCapacityScore(
0, 100, // deployments empty (contributes 1.0 * 0.30 = 0.30)
0, 9900, // ports empty (contributes 1.0 * 0.15 = 0.15)
0, 8192, // memory empty (contributes 1.0 * 0.15 = 0.15)
0, 400, // cpu empty (contributes 1.0 * 0.15 = 0.15)
20, 20, // namespace instances full (contributes 0 * 0.25 = 0)
)
// Expected: 0.30 + 0.15 + 0.15 + 0.15 + 0 = 0.75
expectedNamespaceOnly := 0.75
if namespaceOnlyScore < expectedNamespaceOnly-tolerance || namespaceOnlyScore > expectedNamespaceOnly+tolerance {
t.Errorf("Namespace-only-full score = %f, want %f", namespaceOnlyScore, expectedNamespaceOnly)
}
}
func TestCalculateCapacityScore_NegativeValues(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Test that over-capacity values (which would produce negative scores) are clamped to 0
score := selector.calculateCapacityScore(
200, 100, // 200% deployments (should clamp to 0)
20000, 9900, // over ports (should clamp to 0)
16000, 8192, // over memory (should clamp to 0)
800, 400, // over cpu (should clamp to 0)
40, 20, // over namespace instances (should clamp to 0)
)
if score != 0.0 {
t.Errorf("Over-capacity score = %f, want 0.0", score)
}
}
func TestNodeCapacity_AvailableSlots(t *testing.T) {
tests := []struct {
name string
instanceCount int
expectedAvailable int
}{
{"Empty node", 0, 20},
{"One instance", 1, 19},
{"Half full", 10, 10},
{"Almost full", 19, 1},
{"Full", 20, 0},
{"Over capacity", 25, 0}, // Should clamp to 0
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
available := MaxNamespacesPerNode - tt.instanceCount
if available < 0 {
available = 0
}
if available != tt.expectedAvailable {
t.Errorf("Available slots for %d instances = %d, want %d",
tt.instanceCount, available, tt.expectedAvailable)
}
})
}
}
func TestNewClusterNodeSelector(t *testing.T) {
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
if selector == nil {
t.Fatal("NewClusterNodeSelector returned nil")
}
}
func TestNodeCapacityStruct(t *testing.T) {
// Test NodeCapacity struct initialization
capacity := NodeCapacity{
NodeID: "node-123",
IPAddress: "192.168.1.100",
DeploymentCount: 10,
AllocatedPorts: 50,
AvailablePorts: 9850,
UsedMemoryMB: 2048,
AvailableMemoryMB: 6144,
UsedCPUPercent: 100,
NamespaceInstanceCount: 5,
AvailableNamespaceSlots: 15,
Score: 0.75,
}
if capacity.NodeID != "node-123" {
t.Errorf("NodeID = %s, want node-123", capacity.NodeID)
}
if capacity.AvailableNamespaceSlots != 15 {
t.Errorf("AvailableNamespaceSlots = %d, want 15", capacity.AvailableNamespaceSlots)
}
if capacity.Score != 0.75 {
t.Errorf("Score = %f, want 0.75", capacity.Score)
}
}
func TestScoreRanking(t *testing.T) {
// Test that higher scores indicate more available capacity
logger := zap.NewNop()
mockDB := newMockRQLiteClient()
portAllocator := NewNamespacePortAllocator(mockDB, logger)
selector := NewClusterNodeSelector(mockDB, portAllocator, logger)
// Node A: Light load
scoreA := selector.calculateCapacityScore(
10, 100, // 10% deployments
500, 9900, // ~5% ports
1000, 8192, // ~12% memory
50, 400, // ~12% cpu
2, 20, // 10% namespace instances
)
// Node B: Heavy load
scoreB := selector.calculateCapacityScore(
80, 100, // 80% deployments
8000, 9900, // ~80% ports
7000, 8192, // ~85% memory
350, 400, // ~87% cpu
18, 20, // 90% namespace instances
)
if scoreA <= scoreB {
t.Errorf("Light load score (%f) should be higher than heavy load score (%f)", scoreA, scoreB)
}
}

View File

@ -0,0 +1,341 @@
package namespace
import (
"context"
"fmt"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"github.com/google/uuid"
"go.uber.org/zap"
)
// NamespacePortAllocator manages the reserved port range (10000-10099) for namespace services.
// Each namespace instance on a node gets a block of 5 consecutive ports.
type NamespacePortAllocator struct {
db rqlite.Client
logger *zap.Logger
}
// NewNamespacePortAllocator creates a new port allocator
func NewNamespacePortAllocator(db rqlite.Client, logger *zap.Logger) *NamespacePortAllocator {
return &NamespacePortAllocator{
db: db,
logger: logger.With(zap.String("component", "namespace-port-allocator")),
}
}
// AllocatePortBlock finds and allocates the next available 5-port block on a node.
// Returns an error if the node is at capacity (20 namespace instances).
func (npa *NamespacePortAllocator) AllocatePortBlock(ctx context.Context, nodeID, namespaceClusterID string) (*PortBlock, error) {
internalCtx := client.WithInternalAuth(ctx)
// Check if allocation already exists for this namespace on this node
existingBlock, err := npa.GetPortBlock(ctx, namespaceClusterID, nodeID)
if err == nil && existingBlock != nil {
npa.logger.Debug("Port block already allocated",
zap.String("node_id", nodeID),
zap.String("namespace_cluster_id", namespaceClusterID),
zap.Int("port_start", existingBlock.PortStart),
)
return existingBlock, nil
}
// Retry logic for handling concurrent allocation conflicts
maxRetries := 10
retryDelay := 100 * time.Millisecond
for attempt := 0; attempt < maxRetries; attempt++ {
block, err := npa.tryAllocatePortBlock(internalCtx, nodeID, namespaceClusterID)
if err == nil {
npa.logger.Info("Port block allocated successfully",
zap.String("node_id", nodeID),
zap.String("namespace_cluster_id", namespaceClusterID),
zap.Int("port_start", block.PortStart),
zap.Int("attempt", attempt+1),
)
return block, nil
}
// If it's a conflict error, retry with exponential backoff
if isConflictError(err) {
npa.logger.Debug("Port allocation conflict, retrying",
zap.String("node_id", nodeID),
zap.String("namespace_cluster_id", namespaceClusterID),
zap.Int("attempt", attempt+1),
zap.Error(err),
)
time.Sleep(retryDelay)
retryDelay *= 2
continue
}
// Other errors are non-retryable
return nil, err
}
return nil, &ClusterError{
Message: fmt.Sprintf("failed to allocate port block after %d retries", maxRetries),
}
}
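// Illustrative sketch (not part of the commit): allocating a block for one node of
// a namespace cluster and reading the derived service ports. The function and its
// parameters are assumptions for the example; AllocatePortBlock is the real API.
func exampleAllocate(ctx context.Context, npa *NamespacePortAllocator, nodeID, clusterID string) error {
block, err := npa.AllocatePortBlock(ctx, nodeID, clusterID)
if err != nil {
return err
}
// The first block on a node spans 10000-10004: rqlite HTTP 10000, rqlite Raft 10001,
// olric HTTP 10002, olric memberlist 10003, gateway HTTP 10004
fmt.Printf("allocated %d-%d for cluster %s on node %s\n", block.PortStart, block.PortEnd, clusterID, nodeID)
return nil
}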
// tryAllocatePortBlock attempts to allocate a port block (single attempt)
func (npa *NamespacePortAllocator) tryAllocatePortBlock(ctx context.Context, nodeID, namespaceClusterID string) (*PortBlock, error) {
// Query all allocated port blocks on this node
type portRow struct {
PortStart int `db:"port_start"`
}
var allocatedBlocks []portRow
query := `SELECT port_start FROM namespace_port_allocations WHERE node_id = ? ORDER BY port_start ASC`
err := npa.db.Query(ctx, &allocatedBlocks, query, nodeID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query allocated ports",
Cause: err,
}
}
// Build map of allocated block starts
allocatedStarts := make(map[int]bool)
for _, row := range allocatedBlocks {
allocatedStarts[row.PortStart] = true
}
// Check node capacity
if len(allocatedBlocks) >= MaxNamespacesPerNode {
return nil, ErrNodeAtCapacity
}
// Find first available port block
portStart := -1
for start := NamespacePortRangeStart; start <= NamespacePortRangeEnd-PortsPerNamespace+1; start += PortsPerNamespace {
if !allocatedStarts[start] {
portStart = start
break
}
}
if portStart < 0 {
return nil, ErrNoPortsAvailable
}
// Create port block
block := &PortBlock{
ID: uuid.New().String(),
NodeID: nodeID,
NamespaceClusterID: namespaceClusterID,
PortStart: portStart,
PortEnd: portStart + PortsPerNamespace - 1,
RQLiteHTTPPort: portStart + 0,
RQLiteRaftPort: portStart + 1,
OlricHTTPPort: portStart + 2,
OlricMemberlistPort: portStart + 3,
GatewayHTTPPort: portStart + 4,
AllocatedAt: time.Now(),
}
// Attempt to insert allocation record
insertQuery := `
INSERT INTO namespace_port_allocations (
id, node_id, namespace_cluster_id, port_start, port_end,
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
allocated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`
_, err = npa.db.Exec(ctx, insertQuery,
block.ID,
block.NodeID,
block.NamespaceClusterID,
block.PortStart,
block.PortEnd,
block.RQLiteHTTPPort,
block.RQLiteRaftPort,
block.OlricHTTPPort,
block.OlricMemberlistPort,
block.GatewayHTTPPort,
block.AllocatedAt,
)
if err != nil {
return nil, &ClusterError{
Message: "failed to insert port allocation",
Cause: err,
}
}
return block, nil
}
// DeallocatePortBlock releases a port block when a namespace is deprovisioned
func (npa *NamespacePortAllocator) DeallocatePortBlock(ctx context.Context, namespaceClusterID, nodeID string) error {
internalCtx := client.WithInternalAuth(ctx)
query := `DELETE FROM namespace_port_allocations WHERE namespace_cluster_id = ? AND node_id = ?`
_, err := npa.db.Exec(internalCtx, query, namespaceClusterID, nodeID)
if err != nil {
return &ClusterError{
Message: "failed to deallocate port block",
Cause: err,
}
}
npa.logger.Info("Port block deallocated",
zap.String("namespace_cluster_id", namespaceClusterID),
zap.String("node_id", nodeID),
)
return nil
}
// DeallocateAllPortBlocks releases all port blocks for a namespace cluster
func (npa *NamespacePortAllocator) DeallocateAllPortBlocks(ctx context.Context, namespaceClusterID string) error {
internalCtx := client.WithInternalAuth(ctx)
query := `DELETE FROM namespace_port_allocations WHERE namespace_cluster_id = ?`
_, err := npa.db.Exec(internalCtx, query, namespaceClusterID)
if err != nil {
return &ClusterError{
Message: "failed to deallocate all port blocks",
Cause: err,
}
}
npa.logger.Info("All port blocks deallocated",
zap.String("namespace_cluster_id", namespaceClusterID),
)
return nil
}
// GetPortBlock retrieves the port block for a namespace on a specific node
func (npa *NamespacePortAllocator) GetPortBlock(ctx context.Context, namespaceClusterID, nodeID string) (*PortBlock, error) {
internalCtx := client.WithInternalAuth(ctx)
var blocks []PortBlock
query := `
SELECT id, node_id, namespace_cluster_id, port_start, port_end,
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
allocated_at
FROM namespace_port_allocations
WHERE namespace_cluster_id = ? AND node_id = ?
LIMIT 1
`
err := npa.db.Query(internalCtx, &blocks, query, namespaceClusterID, nodeID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query port block",
Cause: err,
}
}
if len(blocks) == 0 {
return nil, nil
}
return &blocks[0], nil
}
// GetAllPortBlocks retrieves all port blocks for a namespace cluster
func (npa *NamespacePortAllocator) GetAllPortBlocks(ctx context.Context, namespaceClusterID string) ([]PortBlock, error) {
internalCtx := client.WithInternalAuth(ctx)
var blocks []PortBlock
query := `
SELECT id, node_id, namespace_cluster_id, port_start, port_end,
rqlite_http_port, rqlite_raft_port, olric_http_port, olric_memberlist_port, gateway_http_port,
allocated_at
FROM namespace_port_allocations
WHERE namespace_cluster_id = ?
ORDER BY port_start ASC
`
err := npa.db.Query(internalCtx, &blocks, query, namespaceClusterID)
if err != nil {
return nil, &ClusterError{
Message: "failed to query port blocks",
Cause: err,
}
}
return blocks, nil
}
// GetNodeCapacity returns how many more namespace instances a node can host
func (npa *NamespacePortAllocator) GetNodeCapacity(ctx context.Context, nodeID string) (int, error) {
internalCtx := client.WithInternalAuth(ctx)
type countResult struct {
Count int `db:"count"`
}
var results []countResult
query := `SELECT COUNT(*) as count FROM namespace_port_allocations WHERE node_id = ?`
err := npa.db.Query(internalCtx, &results, query, nodeID)
if err != nil {
return 0, &ClusterError{
Message: "failed to count allocated port blocks",
Cause: err,
}
}
if len(results) == 0 {
return MaxNamespacesPerNode, nil
}
allocated := results[0].Count
available := MaxNamespacesPerNode - allocated
if available < 0 {
available = 0
}
return available, nil
}
// GetNodeAllocationCount returns the number of namespace instances on a node
func (npa *NamespacePortAllocator) GetNodeAllocationCount(ctx context.Context, nodeID string) (int, error) {
internalCtx := client.WithInternalAuth(ctx)
type countResult struct {
Count int `db:"count"`
}
var results []countResult
query := `SELECT COUNT(*) as count FROM namespace_port_allocations WHERE node_id = ?`
err := npa.db.Query(internalCtx, &results, query, nodeID)
if err != nil {
return 0, &ClusterError{
Message: "failed to count allocated port blocks",
Cause: err,
}
}
if len(results) == 0 {
return 0, nil
}
return results[0].Count, nil
}
// isConflictError checks if an error is due to a constraint violation
func isConflictError(err error) bool {
if err == nil {
return false
}
errStr := err.Error()
return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict")
}
// contains checks if a string contains a substring (case-sensitive)
func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr))
}
func findSubstring(s, substr string) bool {
for i := 0; i <= len(s)-len(substr); i++ {
if s[i:i+len(substr)] == substr {
return true
}
}
return false
}
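// Note (illustrative): the two helpers above are a hand-rolled, case-sensitive
// equivalent of strings.Contains(s, substr) from the standard library; swapping in
// the stdlib call would behave identically for the error strings matched here.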

View File

@ -0,0 +1,310 @@
package namespace
import (
"context"
"database/sql"
"errors"
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
// mockResult implements sql.Result
type mockResult struct {
lastInsertID int64
rowsAffected int64
}
func (m mockResult) LastInsertId() (int64, error) { return m.lastInsertID, nil }
func (m mockResult) RowsAffected() (int64, error) { return m.rowsAffected, nil }
// mockRQLiteClient implements rqlite.Client for testing
type mockRQLiteClient struct {
queryResults map[string]interface{}
execResults map[string]error
queryCalls []mockQueryCall
execCalls []mockExecCall
}
type mockQueryCall struct {
Query string
Args []interface{}
}
type mockExecCall struct {
Query string
Args []interface{}
}
func newMockRQLiteClient() *mockRQLiteClient {
return &mockRQLiteClient{
queryResults: make(map[string]interface{}),
execResults: make(map[string]error),
queryCalls: make([]mockQueryCall, 0),
execCalls: make([]mockExecCall, 0),
}
}
func (m *mockRQLiteClient) Query(ctx context.Context, dest any, query string, args ...any) error {
ifaceArgs := make([]interface{}, len(args))
for i, a := range args {
ifaceArgs[i] = a
}
m.queryCalls = append(m.queryCalls, mockQueryCall{Query: query, Args: ifaceArgs})
return nil
}
func (m *mockRQLiteClient) Exec(ctx context.Context, query string, args ...any) (sql.Result, error) {
ifaceArgs := make([]interface{}, len(args))
for i, a := range args {
ifaceArgs[i] = a
}
m.execCalls = append(m.execCalls, mockExecCall{Query: query, Args: ifaceArgs})
if err, ok := m.execResults[query]; ok {
return nil, err
}
return mockResult{rowsAffected: 1}, nil
}
func (m *mockRQLiteClient) FindBy(ctx context.Context, dest any, table string, criteria map[string]any, opts ...rqlite.FindOption) error {
return nil
}
func (m *mockRQLiteClient) FindOneBy(ctx context.Context, dest any, table string, criteria map[string]any, opts ...rqlite.FindOption) error {
return nil
}
func (m *mockRQLiteClient) Save(ctx context.Context, entity any) error {
return nil
}
func (m *mockRQLiteClient) Remove(ctx context.Context, entity any) error {
return nil
}
func (m *mockRQLiteClient) Repository(table string) any {
return nil
}
func (m *mockRQLiteClient) CreateQueryBuilder(table string) *rqlite.QueryBuilder {
return nil
}
func (m *mockRQLiteClient) Tx(ctx context.Context, fn func(tx rqlite.Tx) error) error {
return nil
}
// Ensure mockRQLiteClient implements rqlite.Client
var _ rqlite.Client = (*mockRQLiteClient)(nil)
func TestPortBlock_PortAssignment(t *testing.T) {
// Test that port block correctly assigns ports
block := &PortBlock{
ID: "test-id",
NodeID: "node-1",
NamespaceClusterID: "cluster-1",
PortStart: 10000,
PortEnd: 10004,
RQLiteHTTPPort: 10000,
RQLiteRaftPort: 10001,
OlricHTTPPort: 10002,
OlricMemberlistPort: 10003,
GatewayHTTPPort: 10004,
AllocatedAt: time.Now(),
}
// Verify port assignments
if block.RQLiteHTTPPort != block.PortStart+0 {
t.Errorf("RQLiteHTTPPort = %d, want %d", block.RQLiteHTTPPort, block.PortStart+0)
}
if block.RQLiteRaftPort != block.PortStart+1 {
t.Errorf("RQLiteRaftPort = %d, want %d", block.RQLiteRaftPort, block.PortStart+1)
}
if block.OlricHTTPPort != block.PortStart+2 {
t.Errorf("OlricHTTPPort = %d, want %d", block.OlricHTTPPort, block.PortStart+2)
}
if block.OlricMemberlistPort != block.PortStart+3 {
t.Errorf("OlricMemberlistPort = %d, want %d", block.OlricMemberlistPort, block.PortStart+3)
}
if block.GatewayHTTPPort != block.PortStart+4 {
t.Errorf("GatewayHTTPPort = %d, want %d", block.GatewayHTTPPort, block.PortStart+4)
}
}
func TestPortConstants(t *testing.T) {
// Verify constants are correctly defined
if NamespacePortRangeStart != 10000 {
t.Errorf("NamespacePortRangeStart = %d, want 10000", NamespacePortRangeStart)
}
if NamespacePortRangeEnd != 10099 {
t.Errorf("NamespacePortRangeEnd = %d, want 10099", NamespacePortRangeEnd)
}
if PortsPerNamespace != 5 {
t.Errorf("PortsPerNamespace = %d, want 5", PortsPerNamespace)
}
// Verify max namespaces calculation: (10099 - 10000 + 1) / 5 = 100 / 5 = 20
expectedMax := (NamespacePortRangeEnd - NamespacePortRangeStart + 1) / PortsPerNamespace
if MaxNamespacesPerNode != expectedMax {
t.Errorf("MaxNamespacesPerNode = %d, want %d", MaxNamespacesPerNode, expectedMax)
}
if MaxNamespacesPerNode != 20 {
t.Errorf("MaxNamespacesPerNode = %d, want 20", MaxNamespacesPerNode)
}
}
func TestPortRangeCapacity(t *testing.T) {
// Test that 20 namespaces fit exactly in the port range
usedPorts := MaxNamespacesPerNode * PortsPerNamespace
availablePorts := NamespacePortRangeEnd - NamespacePortRangeStart + 1
if usedPorts > availablePorts {
t.Errorf("Port range overflow: %d ports needed for %d namespaces, but only %d available",
usedPorts, MaxNamespacesPerNode, availablePorts)
}
// Verify no wasted ports
if usedPorts != availablePorts {
t.Logf("Note: %d ports unused in range", availablePorts-usedPorts)
}
}
func TestPortBlockAllocation_SequentialBlocks(t *testing.T) {
// Verify that sequential port blocks don't overlap
blocks := make([]*PortBlock, MaxNamespacesPerNode)
for i := 0; i < MaxNamespacesPerNode; i++ {
portStart := NamespacePortRangeStart + (i * PortsPerNamespace)
blocks[i] = &PortBlock{
PortStart: portStart,
PortEnd: portStart + PortsPerNamespace - 1,
RQLiteHTTPPort: portStart + 0,
RQLiteRaftPort: portStart + 1,
OlricHTTPPort: portStart + 2,
OlricMemberlistPort: portStart + 3,
GatewayHTTPPort: portStart + 4,
}
}
// Verify no overlap between consecutive blocks
for i := 0; i < len(blocks)-1; i++ {
if blocks[i].PortEnd >= blocks[i+1].PortStart {
t.Errorf("Block %d (end=%d) overlaps with block %d (start=%d)",
i, blocks[i].PortEnd, i+1, blocks[i+1].PortStart)
}
}
// Verify last block doesn't exceed range
lastBlock := blocks[len(blocks)-1]
if lastBlock.PortEnd > NamespacePortRangeEnd {
t.Errorf("Last block exceeds port range: end=%d, max=%d",
lastBlock.PortEnd, NamespacePortRangeEnd)
}
}
func TestIsConflictError(t *testing.T) {
tests := []struct {
name string
err error
expected bool
}{
{
name: "nil error",
err: nil,
expected: false,
},
{
name: "UNIQUE constraint error",
err: errors.New("UNIQUE constraint failed"),
expected: true,
},
{
name: "constraint violation",
err: errors.New("constraint violation"),
expected: true,
},
{
name: "conflict error",
err: errors.New("conflict detected"),
expected: true,
},
{
name: "regular error",
err: errors.New("connection timeout"),
expected: false,
},
{
name: "empty error",
err: errors.New(""),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := isConflictError(tt.err)
if result != tt.expected {
t.Errorf("isConflictError(%v) = %v, want %v", tt.err, result, tt.expected)
}
})
}
}
func TestContains(t *testing.T) {
tests := []struct {
s string
substr string
expected bool
}{
{"hello world", "world", true},
{"hello world", "hello", true},
{"hello world", "xyz", false},
{"", "", true},
{"hello", "", true},
{"", "hello", false},
{"UNIQUE constraint", "UNIQUE", true},
}
for _, tt := range tests {
t.Run(tt.s+"_"+tt.substr, func(t *testing.T) {
result := contains(tt.s, tt.substr)
if result != tt.expected {
t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, result, tt.expected)
}
})
}
}
func TestNewNamespacePortAllocator(t *testing.T) {
mockDB := newMockRQLiteClient()
logger := zap.NewNop()
allocator := NewNamespacePortAllocator(mockDB, logger)
if allocator == nil {
t.Fatal("NewNamespacePortAllocator returned nil")
}
}
func TestDefaultClusterSizes(t *testing.T) {
// Verify default cluster size constants
if DefaultRQLiteNodeCount != 3 {
t.Errorf("DefaultRQLiteNodeCount = %d, want 3", DefaultRQLiteNodeCount)
}
if DefaultOlricNodeCount != 3 {
t.Errorf("DefaultOlricNodeCount = %d, want 3", DefaultOlricNodeCount)
}
if DefaultGatewayNodeCount != 3 {
t.Errorf("DefaultGatewayNodeCount = %d, want 3", DefaultGatewayNodeCount)
}
// Public namespace should have larger clusters
if PublicRQLiteNodeCount != 5 {
t.Errorf("PublicRQLiteNodeCount = %d, want 5", PublicRQLiteNodeCount)
}
if PublicOlricNodeCount != 5 {
t.Errorf("PublicOlricNodeCount = %d, want 5", PublicOlricNodeCount)
}
}

204
pkg/namespace/types.go Normal file
View File

@ -0,0 +1,204 @@
package namespace
import (
"time"
)
// ClusterStatus represents the current state of a namespace cluster
type ClusterStatus string
const (
ClusterStatusNone ClusterStatus = "none" // No cluster provisioned
ClusterStatusProvisioning ClusterStatus = "provisioning" // Cluster is being provisioned
ClusterStatusReady ClusterStatus = "ready" // Cluster is operational
ClusterStatusDegraded ClusterStatus = "degraded" // Some nodes are unhealthy
ClusterStatusFailed ClusterStatus = "failed" // Cluster failed to provision/operate
ClusterStatusDeprovisioning ClusterStatus = "deprovisioning" // Cluster is being deprovisioned
)
// NodeRole represents the role of a node in a namespace cluster
type NodeRole string
const (
NodeRoleRQLiteLeader NodeRole = "rqlite_leader"
NodeRoleRQLiteFollower NodeRole = "rqlite_follower"
NodeRoleOlric NodeRole = "olric"
NodeRoleGateway NodeRole = "gateway"
)
// NodeStatus represents the status of a service on a node
type NodeStatus string
const (
NodeStatusPending NodeStatus = "pending"
NodeStatusStarting NodeStatus = "starting"
NodeStatusRunning NodeStatus = "running"
NodeStatusStopped NodeStatus = "stopped"
NodeStatusFailed NodeStatus = "failed"
)
// EventType represents types of cluster lifecycle events
type EventType string
const (
EventProvisioningStarted EventType = "provisioning_started"
EventNodesSelected EventType = "nodes_selected"
EventPortsAllocated EventType = "ports_allocated"
EventRQLiteStarted EventType = "rqlite_started"
EventRQLiteJoined EventType = "rqlite_joined"
EventRQLiteLeaderElected EventType = "rqlite_leader_elected"
EventOlricStarted EventType = "olric_started"
EventOlricJoined EventType = "olric_joined"
EventGatewayStarted EventType = "gateway_started"
EventDNSCreated EventType = "dns_created"
EventClusterReady EventType = "cluster_ready"
EventClusterDegraded EventType = "cluster_degraded"
EventClusterFailed EventType = "cluster_failed"
EventNodeFailed EventType = "node_failed"
EventNodeRecovered EventType = "node_recovered"
EventDeprovisionStarted EventType = "deprovisioning_started"
EventDeprovisioned EventType = "deprovisioned"
)
// Port allocation constants
const (
// NamespacePortRangeStart is the beginning of the reserved port range for namespace services
NamespacePortRangeStart = 10000
// NamespacePortRangeEnd is the end of the reserved port range for namespace services
NamespacePortRangeEnd = 10099
// PortsPerNamespace is the number of ports required per namespace instance on a node
// RQLite HTTP (0), RQLite Raft (1), Olric HTTP (2), Olric Memberlist (3), Gateway HTTP (4)
PortsPerNamespace = 5
// MaxNamespacesPerNode is the maximum number of namespace instances a single node can host
MaxNamespacesPerNode = (NamespacePortRangeEnd - NamespacePortRangeStart + 1) / PortsPerNamespace // 20
)
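// Worked example (illustrative): with the range 10000-10099 and 5 ports per
// instance, a node hosts at most (10099 - 10000 + 1) / 5 = 20 namespace instances.
// The i-th block (0-based) starts at 10000 + i*5, so block 3 spans 10015-10019:
// rqlite HTTP 10015, rqlite Raft 10016, olric HTTP 10017,
// olric memberlist 10018, gateway HTTP 10019.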
// Default cluster sizes
const (
DefaultRQLiteNodeCount = 3
DefaultOlricNodeCount = 3
DefaultGatewayNodeCount = 3
PublicRQLiteNodeCount = 5
PublicOlricNodeCount = 5
)
// NamespaceCluster represents a dedicated cluster for a namespace
type NamespaceCluster struct {
ID string `json:"id" db:"id"`
NamespaceID int `json:"namespace_id" db:"namespace_id"`
NamespaceName string `json:"namespace_name" db:"namespace_name"`
Status ClusterStatus `json:"status" db:"status"`
RQLiteNodeCount int `json:"rqlite_node_count" db:"rqlite_node_count"`
OlricNodeCount int `json:"olric_node_count" db:"olric_node_count"`
GatewayNodeCount int `json:"gateway_node_count" db:"gateway_node_count"`
ProvisionedBy string `json:"provisioned_by" db:"provisioned_by"`
ProvisionedAt time.Time `json:"provisioned_at" db:"provisioned_at"`
ReadyAt *time.Time `json:"ready_at,omitempty" db:"ready_at"`
LastHealthCheck *time.Time `json:"last_health_check,omitempty" db:"last_health_check"`
ErrorMessage string `json:"error_message,omitempty" db:"error_message"`
RetryCount int `json:"retry_count" db:"retry_count"`
// Populated by queries, not stored directly
Nodes []ClusterNode `json:"nodes,omitempty"`
}
// ClusterNode represents a node participating in a namespace cluster
type ClusterNode struct {
ID string `json:"id" db:"id"`
NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"`
NodeID string `json:"node_id" db:"node_id"`
Role NodeRole `json:"role" db:"role"`
RQLiteHTTPPort int `json:"rqlite_http_port,omitempty" db:"rqlite_http_port"`
RQLiteRaftPort int `json:"rqlite_raft_port,omitempty" db:"rqlite_raft_port"`
OlricHTTPPort int `json:"olric_http_port,omitempty" db:"olric_http_port"`
OlricMemberlistPort int `json:"olric_memberlist_port,omitempty" db:"olric_memberlist_port"`
GatewayHTTPPort int `json:"gateway_http_port,omitempty" db:"gateway_http_port"`
Status NodeStatus `json:"status" db:"status"`
ProcessPID int `json:"process_pid,omitempty" db:"process_pid"`
LastHeartbeat *time.Time `json:"last_heartbeat,omitempty" db:"last_heartbeat"`
ErrorMessage string `json:"error_message,omitempty" db:"error_message"`
RQLiteJoinAddress string `json:"rqlite_join_address,omitempty" db:"rqlite_join_address"`
OlricPeers string `json:"olric_peers,omitempty" db:"olric_peers"` // JSON array
CreatedAt time.Time `json:"created_at" db:"created_at"`
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
}
// PortBlock represents an allocated block of ports for a namespace on a node
type PortBlock struct {
ID string `json:"id" db:"id"`
NodeID string `json:"node_id" db:"node_id"`
NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"`
PortStart int `json:"port_start" db:"port_start"`
PortEnd int `json:"port_end" db:"port_end"`
RQLiteHTTPPort int `json:"rqlite_http_port" db:"rqlite_http_port"`
RQLiteRaftPort int `json:"rqlite_raft_port" db:"rqlite_raft_port"`
OlricHTTPPort int `json:"olric_http_port" db:"olric_http_port"`
OlricMemberlistPort int `json:"olric_memberlist_port" db:"olric_memberlist_port"`
GatewayHTTPPort int `json:"gateway_http_port" db:"gateway_http_port"`
AllocatedAt time.Time `json:"allocated_at" db:"allocated_at"`
}
// ClusterEvent represents an audit event for cluster lifecycle
type ClusterEvent struct {
ID string `json:"id" db:"id"`
NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"`
EventType EventType `json:"event_type" db:"event_type"`
NodeID string `json:"node_id,omitempty" db:"node_id"`
Message string `json:"message,omitempty" db:"message"`
Metadata string `json:"metadata,omitempty" db:"metadata"` // JSON
CreatedAt time.Time `json:"created_at" db:"created_at"`
}
// ClusterProvisioningStatus is the response format for the /v1/namespace/status endpoint
type ClusterProvisioningStatus struct {
ClusterID string `json:"cluster_id"`
Namespace string `json:"namespace"`
Status ClusterStatus `json:"status"`
Nodes []string `json:"nodes"`
RQLiteReady bool `json:"rqlite_ready"`
OlricReady bool `json:"olric_ready"`
GatewayReady bool `json:"gateway_ready"`
DNSReady bool `json:"dns_ready"`
Error string `json:"error,omitempty"`
CreatedAt time.Time `json:"created_at"`
ReadyAt *time.Time `json:"ready_at,omitempty"`
}
// ProvisioningResponse is returned when a new namespace triggers cluster provisioning
type ProvisioningResponse struct {
Status string `json:"status"`
ClusterID string `json:"cluster_id"`
PollURL string `json:"poll_url"`
EstimatedTimeSeconds int `json:"estimated_time_seconds"`
}
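// Illustrative sketch: the kind of 202 body a client might receive when a new
// namespace triggers provisioning, and the URL it polls afterwards. The values are
// made up for the example; the shape follows ProvisioningResponse's json tags.
//
//   {
//     "status": "provisioning",
//     "cluster_id": "cluster-abc",
//     "poll_url": "/v1/namespace/status?id=cluster-abc",
//     "estimated_time_seconds": 120
//   }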
// Errors
type ClusterError struct {
Message string
Cause error
}
func (e *ClusterError) Error() string {
if e.Cause != nil {
return e.Message + ": " + e.Cause.Error()
}
return e.Message
}
func (e *ClusterError) Unwrap() error {
return e.Cause
}
var (
ErrNoPortsAvailable = &ClusterError{Message: "no ports available on node"}
ErrNodeAtCapacity = &ClusterError{Message: "node has reached maximum namespace instances"}
ErrInsufficientNodes = &ClusterError{Message: "insufficient nodes available for cluster"}
ErrClusterNotFound = &ClusterError{Message: "namespace cluster not found"}
ErrClusterAlreadyExists = &ClusterError{Message: "namespace cluster already exists"}
ErrProvisioningFailed = &ClusterError{Message: "cluster provisioning failed"}
ErrNamespaceNotFound = &ClusterError{Message: "namespace not found"}
ErrInvalidClusterStatus = &ClusterError{Message: "invalid cluster status for operation"}
)
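// Illustrative sketch: because these are package-level sentinels and ClusterError
// implements Unwrap, callers can match them with errors.Is. The allocator variable
// below is an assumption for the example.
//
//   if _, err := npa.AllocatePortBlock(ctx, nodeID, clusterID); errors.Is(err, ErrNodeAtCapacity) {
//       // try a different node
//   }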

405
pkg/namespace/types_test.go Normal file
View File

@ -0,0 +1,405 @@
package namespace
import (
"errors"
"testing"
"time"
)
func TestClusterStatus_Values(t *testing.T) {
// Verify all cluster status values are correct
tests := []struct {
status ClusterStatus
expected string
}{
{ClusterStatusNone, "none"},
{ClusterStatusProvisioning, "provisioning"},
{ClusterStatusReady, "ready"},
{ClusterStatusDegraded, "degraded"},
{ClusterStatusFailed, "failed"},
{ClusterStatusDeprovisioning, "deprovisioning"},
}
for _, tt := range tests {
t.Run(string(tt.status), func(t *testing.T) {
if string(tt.status) != tt.expected {
t.Errorf("ClusterStatus = %s, want %s", tt.status, tt.expected)
}
})
}
}
func TestNodeRole_Values(t *testing.T) {
// Verify all node role values are correct
tests := []struct {
role NodeRole
expected string
}{
{NodeRoleRQLiteLeader, "rqlite_leader"},
{NodeRoleRQLiteFollower, "rqlite_follower"},
{NodeRoleOlric, "olric"},
{NodeRoleGateway, "gateway"},
}
for _, tt := range tests {
t.Run(string(tt.role), func(t *testing.T) {
if string(tt.role) != tt.expected {
t.Errorf("NodeRole = %s, want %s", tt.role, tt.expected)
}
})
}
}
func TestNodeStatus_Values(t *testing.T) {
// Verify all node status values are correct
tests := []struct {
status NodeStatus
expected string
}{
{NodeStatusPending, "pending"},
{NodeStatusStarting, "starting"},
{NodeStatusRunning, "running"},
{NodeStatusStopped, "stopped"},
{NodeStatusFailed, "failed"},
}
for _, tt := range tests {
t.Run(string(tt.status), func(t *testing.T) {
if string(tt.status) != tt.expected {
t.Errorf("NodeStatus = %s, want %s", tt.status, tt.expected)
}
})
}
}
func TestEventType_Values(t *testing.T) {
// Verify all event type values are correct
tests := []struct {
eventType EventType
expected string
}{
{EventProvisioningStarted, "provisioning_started"},
{EventNodesSelected, "nodes_selected"},
{EventPortsAllocated, "ports_allocated"},
{EventRQLiteStarted, "rqlite_started"},
{EventRQLiteJoined, "rqlite_joined"},
{EventRQLiteLeaderElected, "rqlite_leader_elected"},
{EventOlricStarted, "olric_started"},
{EventOlricJoined, "olric_joined"},
{EventGatewayStarted, "gateway_started"},
{EventDNSCreated, "dns_created"},
{EventClusterReady, "cluster_ready"},
{EventClusterDegraded, "cluster_degraded"},
{EventClusterFailed, "cluster_failed"},
{EventNodeFailed, "node_failed"},
{EventNodeRecovered, "node_recovered"},
{EventDeprovisionStarted, "deprovisioning_started"},
{EventDeprovisioned, "deprovisioned"},
}
for _, tt := range tests {
t.Run(string(tt.eventType), func(t *testing.T) {
if string(tt.eventType) != tt.expected {
t.Errorf("EventType = %s, want %s", tt.eventType, tt.expected)
}
})
}
}
func TestClusterError_Error(t *testing.T) {
tests := []struct {
name string
err *ClusterError
expected string
}{
{
name: "message only",
err: &ClusterError{Message: "something failed"},
expected: "something failed",
},
{
name: "message with cause",
err: &ClusterError{Message: "operation failed", Cause: errors.New("connection timeout")},
expected: "operation failed: connection timeout",
},
{
name: "empty message with cause",
err: &ClusterError{Message: "", Cause: errors.New("cause")},
expected: ": cause",
},
{
name: "empty message no cause",
err: &ClusterError{Message: ""},
expected: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.err.Error()
if result != tt.expected {
t.Errorf("Error() = %q, want %q", result, tt.expected)
}
})
}
}
func TestClusterError_Unwrap(t *testing.T) {
cause := errors.New("original error")
err := &ClusterError{
Message: "wrapped",
Cause: cause,
}
unwrapped := err.Unwrap()
if unwrapped != cause {
t.Errorf("Unwrap() = %v, want %v", unwrapped, cause)
}
// Test with no cause
errNoCause := &ClusterError{Message: "no cause"}
if errNoCause.Unwrap() != nil {
t.Errorf("Unwrap() with no cause should return nil")
}
}
func TestPredefinedErrors(t *testing.T) {
// Test that predefined errors have the correct messages
tests := []struct {
name string
err *ClusterError
expected string
}{
{"ErrNoPortsAvailable", ErrNoPortsAvailable, "no ports available on node"},
{"ErrNodeAtCapacity", ErrNodeAtCapacity, "node has reached maximum namespace instances"},
{"ErrInsufficientNodes", ErrInsufficientNodes, "insufficient nodes available for cluster"},
{"ErrClusterNotFound", ErrClusterNotFound, "namespace cluster not found"},
{"ErrClusterAlreadyExists", ErrClusterAlreadyExists, "namespace cluster already exists"},
{"ErrProvisioningFailed", ErrProvisioningFailed, "cluster provisioning failed"},
{"ErrNamespaceNotFound", ErrNamespaceNotFound, "namespace not found"},
{"ErrInvalidClusterStatus", ErrInvalidClusterStatus, "invalid cluster status for operation"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.err.Message != tt.expected {
t.Errorf("%s.Message = %q, want %q", tt.name, tt.err.Message, tt.expected)
}
})
}
}
func TestNamespaceCluster_Struct(t *testing.T) {
now := time.Now()
readyAt := now.Add(5 * time.Minute)
cluster := &NamespaceCluster{
ID: "cluster-123",
NamespaceID: 42,
NamespaceName: "test-namespace",
Status: ClusterStatusReady,
RQLiteNodeCount: 3,
OlricNodeCount: 3,
GatewayNodeCount: 3,
ProvisionedBy: "admin",
ProvisionedAt: now,
ReadyAt: &readyAt,
LastHealthCheck: nil,
ErrorMessage: "",
RetryCount: 0,
Nodes: nil,
}
if cluster.ID != "cluster-123" {
t.Errorf("ID = %s, want cluster-123", cluster.ID)
}
if cluster.NamespaceID != 42 {
t.Errorf("NamespaceID = %d, want 42", cluster.NamespaceID)
}
if cluster.Status != ClusterStatusReady {
t.Errorf("Status = %s, want %s", cluster.Status, ClusterStatusReady)
}
if cluster.RQLiteNodeCount != 3 {
t.Errorf("RQLiteNodeCount = %d, want 3", cluster.RQLiteNodeCount)
}
}
func TestClusterNode_Struct(t *testing.T) {
now := time.Now()
heartbeat := now.Add(-30 * time.Second)
node := &ClusterNode{
ID: "node-record-123",
NamespaceClusterID: "cluster-456",
NodeID: "12D3KooWabc123",
Role: NodeRoleRQLiteLeader,
RQLiteHTTPPort: 10000,
RQLiteRaftPort: 10001,
OlricHTTPPort: 10002,
OlricMemberlistPort: 10003,
GatewayHTTPPort: 10004,
Status: NodeStatusRunning,
ProcessPID: 12345,
LastHeartbeat: &heartbeat,
ErrorMessage: "",
RQLiteJoinAddress: "192.168.1.100:10001",
OlricPeers: `["192.168.1.100:10003","192.168.1.101:10003"]`,
CreatedAt: now,
UpdatedAt: now,
}
if node.Role != NodeRoleRQLiteLeader {
t.Errorf("Role = %s, want %s", node.Role, NodeRoleRQLiteLeader)
}
if node.Status != NodeStatusRunning {
t.Errorf("Status = %s, want %s", node.Status, NodeStatusRunning)
}
if node.RQLiteHTTPPort != 10000 {
t.Errorf("RQLiteHTTPPort = %d, want 10000", node.RQLiteHTTPPort)
}
if node.ProcessPID != 12345 {
t.Errorf("ProcessPID = %d, want 12345", node.ProcessPID)
}
}
func TestClusterProvisioningStatus_Struct(t *testing.T) {
now := time.Now()
readyAt := now.Add(2 * time.Minute)
status := &ClusterProvisioningStatus{
ClusterID: "cluster-789",
Namespace: "my-namespace",
Status: ClusterStatusProvisioning,
Nodes: []string{"node-1", "node-2", "node-3"},
RQLiteReady: true,
OlricReady: true,
GatewayReady: false,
DNSReady: false,
Error: "",
CreatedAt: now,
ReadyAt: &readyAt,
}
if status.ClusterID != "cluster-789" {
t.Errorf("ClusterID = %s, want cluster-789", status.ClusterID)
}
if len(status.Nodes) != 3 {
t.Errorf("len(Nodes) = %d, want 3", len(status.Nodes))
}
if !status.RQLiteReady {
t.Error("RQLiteReady should be true")
}
if status.GatewayReady {
t.Error("GatewayReady should be false")
}
}
func TestProvisioningResponse_Struct(t *testing.T) {
resp := &ProvisioningResponse{
Status: "provisioning",
ClusterID: "cluster-abc",
PollURL: "/v1/namespace/status?id=cluster-abc",
EstimatedTimeSeconds: 120,
}
if resp.Status != "provisioning" {
t.Errorf("Status = %s, want provisioning", resp.Status)
}
if resp.ClusterID != "cluster-abc" {
t.Errorf("ClusterID = %s, want cluster-abc", resp.ClusterID)
}
if resp.EstimatedTimeSeconds != 120 {
t.Errorf("EstimatedTimeSeconds = %d, want 120", resp.EstimatedTimeSeconds)
}
}
func TestClusterEvent_Struct(t *testing.T) {
now := time.Now()
event := &ClusterEvent{
ID: "event-123",
NamespaceClusterID: "cluster-456",
EventType: EventClusterReady,
NodeID: "node-1",
Message: "Cluster is now ready",
Metadata: `{"nodes":["node-1","node-2","node-3"]}`,
CreatedAt: now,
}
if event.EventType != EventClusterReady {
t.Errorf("EventType = %s, want %s", event.EventType, EventClusterReady)
}
if event.Message != "Cluster is now ready" {
t.Errorf("Message = %s, want 'Cluster is now ready'", event.Message)
}
}
func TestPortBlock_Struct(t *testing.T) {
now := time.Now()
block := &PortBlock{
ID: "port-block-123",
NodeID: "node-456",
NamespaceClusterID: "cluster-789",
PortStart: 10000,
PortEnd: 10004,
RQLiteHTTPPort: 10000,
RQLiteRaftPort: 10001,
OlricHTTPPort: 10002,
OlricMemberlistPort: 10003,
GatewayHTTPPort: 10004,
AllocatedAt: now,
}
// Verify port calculations
if block.PortEnd-block.PortStart+1 != PortsPerNamespace {
t.Errorf("Port range size = %d, want %d", block.PortEnd-block.PortStart+1, PortsPerNamespace)
}
// Verify each port is within the block
ports := []int{
block.RQLiteHTTPPort,
block.RQLiteRaftPort,
block.OlricHTTPPort,
block.OlricMemberlistPort,
block.GatewayHTTPPort,
}
for i, port := range ports {
if port < block.PortStart || port > block.PortEnd {
t.Errorf("Port %d (%d) is outside block range [%d, %d]",
i, port, block.PortStart, block.PortEnd)
}
}
}
func TestErrorsImplementError(t *testing.T) {
// Verify ClusterError implements error interface
var _ error = &ClusterError{}
err := &ClusterError{Message: "test error"}
var errInterface error = err
if errInterface.Error() != "test error" {
t.Errorf("error interface Error() = %s, want 'test error'", errInterface.Error())
}
}
func TestErrorsUnwrap(t *testing.T) {
// Test errors.Is/errors.As compatibility
cause := errors.New("root cause")
err := &ClusterError{
Message: "wrapper",
Cause: cause,
}
if !errors.Is(err, cause) {
t.Error("errors.Is should find the wrapped cause")
}
// Test unwrap chain
unwrapped := errors.Unwrap(err)
if unwrapped != cause {
t.Error("errors.Unwrap should return the cause")
}
}

View File

@ -0,0 +1,488 @@
package olric
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/tlsutil"
"go.uber.org/zap"
"gopkg.in/yaml.v3"
)
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
type InstanceNodeStatus string
const (
InstanceStatusPending InstanceNodeStatus = "pending"
InstanceStatusStarting InstanceNodeStatus = "starting"
InstanceStatusRunning InstanceNodeStatus = "running"
InstanceStatusStopped InstanceNodeStatus = "stopped"
InstanceStatusFailed InstanceNodeStatus = "failed"
)
// InstanceError represents an error during instance operations (local type to avoid import cycle)
type InstanceError struct {
Message string
Cause error
}
func (e *InstanceError) Error() string {
if e.Cause != nil {
return e.Message + ": " + e.Cause.Error()
}
return e.Message
}
func (e *InstanceError) Unwrap() error {
return e.Cause
}
// InstanceSpawner manages multiple Olric instances for namespace clusters.
// Each namespace gets its own Olric cluster with dedicated ports and memberlist.
type InstanceSpawner struct {
logger *zap.Logger
baseDir string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
instances map[string]*OlricInstance
mu sync.RWMutex
}
// OlricInstance represents a running Olric instance for a namespace
type OlricInstance struct {
Namespace string
NodeID string
HTTPPort int
MemberlistPort int
BindAddr string
AdvertiseAddr string
PeerAddresses []string // Memberlist peer addresses for cluster discovery
ConfigPath string
DataDir string
PID int
Status InstanceNodeStatus
StartedAt time.Time
LastHealthCheck time.Time
cmd *exec.Cmd
logger *zap.Logger
}
// InstanceConfig holds configuration for spawning an Olric instance
type InstanceConfig struct {
Namespace string // Namespace name (e.g., "alice")
NodeID string // Physical node ID
HTTPPort int // HTTP API port
MemberlistPort int // Memberlist gossip port
BindAddr string // Address to bind (e.g., "0.0.0.0")
AdvertiseAddr string // Address to advertise (e.g., "192.168.1.10")
PeerAddresses []string // Memberlist peer addresses for initial cluster join
}
// OlricConfig represents the Olric YAML configuration structure
type OlricConfig struct {
Server OlricServerConfig `yaml:"server"`
Memberlist OlricMemberlistConfig `yaml:"memberlist"`
}
// OlricServerConfig represents the server section of Olric config
type OlricServerConfig struct {
BindAddr string `yaml:"bindAddr"`
BindPort int `yaml:"bindPort"`
}
// OlricMemberlistConfig represents the memberlist section of Olric config
type OlricMemberlistConfig struct {
Environment string `yaml:"environment"`
BindAddr string `yaml:"bindAddr"`
BindPort int `yaml:"bindPort"`
Peers []string `yaml:"peers,omitempty"`
}
// NewInstanceSpawner creates a new Olric instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
return &InstanceSpawner{
logger: logger.With(zap.String("component", "olric-instance-spawner")),
baseDir: baseDir,
instances: make(map[string]*OlricInstance),
}
}
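// Example wiring (a minimal sketch; ctx, logger and the concrete ports and
// addresses are assumptions for illustration - real values come from the
// namespace's port block, and baseDir is e.g. ~/.orama/data/namespaces as noted above):
//
//	spawner := NewInstanceSpawner(baseDir, logger)
//	inst, err := spawner.SpawnInstance(ctx, InstanceConfig{
//		Namespace:      "alice",
//		NodeID:         "node-1",
//		HTTPPort:       10002,
//		MemberlistPort: 10003,
//		BindAddr:       "0.0.0.0",
//		AdvertiseAddr:  "192.168.1.10",
//		PeerAddresses:  []string{"192.168.1.11:10003", "192.168.1.12:10003"},
//	})
//	if err != nil {
//		// handle spawn failure
//	}
//	_ = inst.DSN() // later: spawner.StopInstance(ctx, "alice", "node-1")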
// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(namespace, nodeID string) string {
return fmt.Sprintf("%s:%s", namespace, nodeID)
}
// SpawnInstance starts a new Olric instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*OlricInstance, error) {
key := instanceKey(cfg.Namespace, cfg.NodeID)
is.mu.Lock()
if existing, ok := is.instances[key]; ok {
// Instance already exists: return it if it is still running.
if existing.Status == InstanceStatusRunning {
is.mu.Unlock()
return existing, nil
}
// Otherwise, drop the stale entry and start fresh.
delete(is.instances, key)
}
is.mu.Unlock()
// Create data and config directories
dataDir := filepath.Join(is.baseDir, cfg.Namespace, "olric", cfg.NodeID)
configDir := filepath.Join(is.baseDir, cfg.Namespace, "configs")
logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
for _, dir := range []string{dataDir, configDir, logsDir} {
if err := os.MkdirAll(dir, 0755); err != nil {
return nil, &InstanceError{
Message: fmt.Sprintf("failed to create directory %s", dir),
Cause: err,
}
}
}
// Generate config file
configPath := filepath.Join(configDir, fmt.Sprintf("olric-%s.yaml", cfg.NodeID))
if err := is.generateConfig(configPath, cfg); err != nil {
return nil, err
}
instance := &OlricInstance{
Namespace: cfg.Namespace,
NodeID: cfg.NodeID,
HTTPPort: cfg.HTTPPort,
MemberlistPort: cfg.MemberlistPort,
BindAddr: cfg.BindAddr,
AdvertiseAddr: cfg.AdvertiseAddr,
PeerAddresses: cfg.PeerAddresses,
ConfigPath: configPath,
DataDir: dataDir,
Status: InstanceStatusStarting,
logger: is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
}
instance.logger.Info("Starting Olric instance",
zap.Int("http_port", cfg.HTTPPort),
zap.Int("memberlist_port", cfg.MemberlistPort),
zap.Strings("peers", cfg.PeerAddresses),
)
// Create command with config environment variable
cmd := exec.CommandContext(ctx, "olric-server")
cmd.Env = append(os.Environ(), fmt.Sprintf("OLRIC_SERVER_CONFIG=%s", configPath))
instance.cmd = cmd
// Setup logging
logPath := filepath.Join(logsDir, fmt.Sprintf("olric-%s.log", cfg.NodeID))
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return nil, &InstanceError{
Message: "failed to open log file",
Cause: err,
}
}
cmd.Stdout = logFile
cmd.Stderr = logFile
// Start the process
if err := cmd.Start(); err != nil {
logFile.Close()
return nil, &InstanceError{
Message: "failed to start Olric process",
Cause: err,
}
}
logFile.Close()
instance.PID = cmd.Process.Pid
instance.StartedAt = time.Now()
// Store instance
is.mu.Lock()
is.instances[key] = instance
is.mu.Unlock()
// Wait for instance to be ready
if err := is.waitForInstanceReady(ctx, instance); err != nil {
// Kill the process on failure
if cmd.Process != nil {
_ = cmd.Process.Kill()
}
is.mu.Lock()
delete(is.instances, key)
is.mu.Unlock()
return nil, &InstanceError{
Message: "Olric instance did not become ready",
Cause: err,
}
}
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
instance.logger.Info("Olric instance started successfully",
zap.Int("pid", instance.PID),
)
// Start background process monitor
go is.monitorInstance(instance)
return instance, nil
}
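// On disk, a spawn for namespace "alice" on node "node-1" leaves roughly the
// following layout under baseDir (illustrative; the paths mirror the Join calls above):
//
//	<baseDir>/alice/olric/node-1/              Olric data directory
//	<baseDir>/alice/configs/olric-node-1.yaml  generated configuration
//	<baseDir>/alice/logs/olric-node-1.log      combined stdout/stderr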
// generateConfig generates the Olric YAML configuration file
func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig) error {
// Use "lan" environment for namespace clusters (low latency expected)
olricCfg := OlricConfig{
Server: OlricServerConfig{
BindAddr: cfg.BindAddr,
BindPort: cfg.HTTPPort,
},
Memberlist: OlricMemberlistConfig{
Environment: "lan",
BindAddr: cfg.BindAddr,
BindPort: cfg.MemberlistPort,
Peers: cfg.PeerAddresses,
},
}
data, err := yaml.Marshal(olricCfg)
if err != nil {
return &InstanceError{
Message: "failed to marshal Olric config",
Cause: err,
}
}
if err := os.WriteFile(configPath, data, 0644); err != nil {
return &InstanceError{
Message: "failed to write Olric config",
Cause: err,
}
}
return nil
}
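// For the example config above, the generated YAML looks roughly like this
// (illustrative; the exact layout is whatever yaml.Marshal emits for the structs
// defined earlier in this file):
//
//	server:
//	    bindAddr: 0.0.0.0
//	    bindPort: 10002
//	memberlist:
//	    environment: lan
//	    bindAddr: 0.0.0.0
//	    bindPort: 10003
//	    peers:
//	        - 192.168.1.11:10003
//	        - 192.168.1.12:10003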
// StopInstance stops an Olric instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, ns, nodeID string) error {
key := instanceKey(ns, nodeID)
is.mu.Lock()
instance, ok := is.instances[key]
if !ok {
is.mu.Unlock()
return nil // Already stopped
}
delete(is.instances, key)
is.mu.Unlock()
if instance.cmd != nil && instance.cmd.Process != nil {
instance.logger.Info("Stopping Olric instance", zap.Int("pid", instance.PID))
// Send SIGTERM for graceful shutdown
if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
// If SIGTERM fails, kill it
_ = instance.cmd.Process.Kill()
}
// Wait for process to exit with timeout
done := make(chan error, 1)
go func() {
done <- instance.cmd.Wait()
}()
select {
case <-done:
instance.logger.Info("Olric instance stopped gracefully")
case <-time.After(10 * time.Second):
instance.logger.Warn("Olric instance did not stop gracefully, killing")
_ = instance.cmd.Process.Kill()
case <-ctx.Done():
_ = instance.cmd.Process.Kill()
return ctx.Err()
}
}
instance.Status = InstanceStatusStopped
return nil
}
// StopAllInstances stops all Olric instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
is.mu.RLock()
var keys []string
for key, inst := range is.instances {
if inst.Namespace == ns {
keys = append(keys, key)
}
}
is.mu.RUnlock()
var lastErr error
for _, key := range keys {
parts := strings.SplitN(key, ":", 2)
if len(parts) == 2 {
if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
lastErr = err
}
}
}
return lastErr
}
// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(ns, nodeID string) (*OlricInstance, bool) {
is.mu.RLock()
defer is.mu.RUnlock()
instance, ok := is.instances[instanceKey(ns, nodeID)]
return instance, ok
}
// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*OlricInstance {
is.mu.RLock()
defer is.mu.RUnlock()
var instances []*OlricInstance
for _, inst := range is.instances {
if inst.Namespace == ns {
instances = append(instances, inst)
}
}
return instances
}
// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, ns, nodeID string) (bool, error) {
instance, ok := is.GetInstance(ns, nodeID)
if !ok {
return false, &InstanceError{Message: "instance not found"}
}
healthy, err := instance.IsHealthy(ctx)
if healthy {
is.mu.Lock()
instance.LastHealthCheck = time.Now()
is.mu.Unlock()
}
return healthy, err
}
// waitForInstanceReady waits for the Olric instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *OlricInstance) error {
client := tlsutil.NewHTTPClient(2 * time.Second)
// Olric health check endpoint
url := fmt.Sprintf("http://localhost:%d/ready", instance.HTTPPort)
maxAttempts := 120 // 2 minutes
for i := 0; i < maxAttempts; i++ {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
}
resp, err := client.Get(url)
if err != nil {
continue
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
instance.logger.Debug("Olric instance ready",
zap.Int("attempts", i+1),
)
return nil
}
}
return fmt.Errorf("Olric did not become ready within timeout")
}
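// While debugging, the same readiness probe can be run by hand, e.g.
// `curl -f http://localhost:10002/ready` with the namespace's Olric HTTP port
// substituted in (illustrative only).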
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *OlricInstance) {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for range ticker.C {
is.mu.RLock()
key := instanceKey(instance.Namespace, instance.NodeID)
_, exists := is.instances[key]
is.mu.RUnlock()
if !exists {
// Instance was removed
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
healthy, _ := instance.IsHealthy(ctx)
cancel()
is.mu.Lock()
if healthy {
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
} else {
instance.Status = InstanceStatusFailed
instance.logger.Warn("Olric instance health check failed")
}
is.mu.Unlock()
// Check if process is still running
if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
is.mu.Lock()
instance.Status = InstanceStatusStopped
is.mu.Unlock()
instance.logger.Warn("Olric instance process exited unexpectedly")
return
}
}
}
// IsHealthy checks if the Olric instance is healthy
func (oi *OlricInstance) IsHealthy(ctx context.Context) (bool, error) {
url := fmt.Sprintf("http://localhost:%d/ready", oi.HTTPPort)
client := tlsutil.NewHTTPClient(5 * time.Second)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return false, err
}
resp, err := client.Do(req)
if err != nil {
return false, err
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK, nil
}
// DSN returns the connection address for this Olric instance
func (oi *OlricInstance) DSN() string {
return fmt.Sprintf("localhost:%d", oi.HTTPPort)
}
// AdvertisedDSN returns the advertised connection address
func (oi *OlricInstance) AdvertisedDSN() string {
return fmt.Sprintf("%s:%d", oi.AdvertiseAddr, oi.HTTPPort)
}
// MemberlistAddress returns the memberlist address for cluster communication
func (oi *OlricInstance) MemberlistAddress() string {
return fmt.Sprintf("%s:%d", oi.AdvertiseAddr, oi.MemberlistPort)
}
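// For an instance with AdvertiseAddr "192.168.1.10", HTTPPort 10002 and
// MemberlistPort 10003 (example values only), these helpers return:
//
//	DSN()               -> "localhost:10002"
//	AdvertisedDSN()     -> "192.168.1.10:10002"
//	MemberlistAddress() -> "192.168.1.10:10003"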

View File

@ -0,0 +1,586 @@
package rqlite
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/tlsutil"
"go.uber.org/zap"
)
// InstanceNodeStatus represents the status of an instance (local type to avoid import cycle)
type InstanceNodeStatus string
const (
InstanceStatusPending InstanceNodeStatus = "pending"
InstanceStatusStarting InstanceNodeStatus = "starting"
InstanceStatusRunning InstanceNodeStatus = "running"
InstanceStatusStopped InstanceNodeStatus = "stopped"
InstanceStatusFailed InstanceNodeStatus = "failed"
)
// InstanceError represents an error during instance operations (local type to avoid import cycle)
type InstanceError struct {
Message string
Cause error
}
func (e *InstanceError) Error() string {
if e.Cause != nil {
return e.Message + ": " + e.Cause.Error()
}
return e.Message
}
func (e *InstanceError) Unwrap() error {
return e.Cause
}
// InstanceSpawner manages multiple RQLite instances for namespace clusters.
// Each namespace gets its own RQLite cluster with dedicated ports and data directories.
type InstanceSpawner struct {
logger *zap.Logger
baseDir string // Base directory for all namespace data (e.g., ~/.orama/data/namespaces)
instances map[string]*RQLiteInstance
mu sync.RWMutex
}
// RQLiteInstance represents a running RQLite instance for a namespace
type RQLiteInstance struct {
Namespace string
NodeID string
HTTPPort int
RaftPort int
HTTPAdvAddress string
RaftAdvAddress string
JoinAddresses []string
DataDir string
IsLeader bool
PID int
Status InstanceNodeStatus
StartedAt time.Time
LastHealthCheck time.Time
cmd *exec.Cmd
logger *zap.Logger
}
// InstanceConfig holds configuration for spawning an RQLite instance
type InstanceConfig struct {
Namespace string // Namespace name (e.g., "alice")
NodeID string // Physical node ID
HTTPPort int // HTTP API port
RaftPort int // Raft consensus port
HTTPAdvAddress string // Advertised HTTP address (e.g., "192.168.1.10:10000")
RaftAdvAddress string // Advertised Raft address (e.g., "192.168.1.10:10001")
JoinAddresses []string // Addresses of existing cluster members to join
IsLeader bool // Whether this is the initial leader node
}
// NewInstanceSpawner creates a new RQLite instance spawner
func NewInstanceSpawner(baseDir string, logger *zap.Logger) *InstanceSpawner {
return &InstanceSpawner{
logger: logger.With(zap.String("component", "rqlite-instance-spawner")),
baseDir: baseDir,
instances: make(map[string]*RQLiteInstance),
}
}
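// Example usage (a sketch; ctx, logger and the concrete ports and addresses are
// placeholders - baseDir is e.g. ~/.orama/data/namespaces as noted above):
//
//	spawner := NewInstanceSpawner(baseDir, logger)
//	// On the bootstrap node:
//	leader, err := spawner.SpawnInstance(ctx, InstanceConfig{
//		Namespace:      "alice",
//		NodeID:         "node-1",
//		HTTPPort:       10000,
//		RaftPort:       10001,
//		HTTPAdvAddress: "192.168.1.10:10000",
//		RaftAdvAddress: "192.168.1.10:10001",
//		IsLeader:       true,
//	})
//	// On every other node, that node's local spawner passes the existing members
//	// via JoinAddresses, e.g. JoinAddresses: []string{"192.168.1.10:10000"}.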
// instanceKey generates a unique key for an instance based on namespace and node
func instanceKey(namespace, nodeID string) string {
return fmt.Sprintf("%s:%s", namespace, nodeID)
}
// SpawnInstance starts a new RQLite instance for a namespace on a specific node.
// Returns the instance info or an error if spawning fails.
func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig) (*RQLiteInstance, error) {
key := instanceKey(cfg.Namespace, cfg.NodeID)
is.mu.Lock()
if existing, ok := is.instances[key]; ok {
// Instance already exists: return it if it is still running.
if existing.Status == InstanceStatusRunning {
is.mu.Unlock()
return existing, nil
}
// Otherwise, drop the stale entry and start fresh.
delete(is.instances, key)
}
is.mu.Unlock()
// Create data directory
dataDir := filepath.Join(is.baseDir, cfg.Namespace, "rqlite", cfg.NodeID)
if err := os.MkdirAll(dataDir, 0755); err != nil {
return nil, &InstanceError{
Message: "failed to create data directory",
Cause: err,
}
}
// Create logs directory
logsDir := filepath.Join(is.baseDir, cfg.Namespace, "logs")
if err := os.MkdirAll(logsDir, 0755); err != nil {
return nil, &InstanceError{
Message: "failed to create logs directory",
Cause: err,
}
}
instance := &RQLiteInstance{
Namespace: cfg.Namespace,
NodeID: cfg.NodeID,
HTTPPort: cfg.HTTPPort,
RaftPort: cfg.RaftPort,
HTTPAdvAddress: cfg.HTTPAdvAddress,
RaftAdvAddress: cfg.RaftAdvAddress,
JoinAddresses: cfg.JoinAddresses,
DataDir: dataDir,
IsLeader: cfg.IsLeader,
Status: InstanceStatusStarting,
logger: is.logger.With(zap.String("namespace", cfg.Namespace), zap.String("node_id", cfg.NodeID)),
}
// Build command arguments
args := []string{
"-http-addr", fmt.Sprintf("0.0.0.0:%d", cfg.HTTPPort),
"-http-adv-addr", cfg.HTTPAdvAddress,
"-raft-addr", fmt.Sprintf("0.0.0.0:%d", cfg.RaftPort),
"-raft-adv-addr", cfg.RaftAdvAddress,
}
// Handle cluster joining
if len(cfg.JoinAddresses) > 0 && !cfg.IsLeader {
// Remove peers.json if it exists to avoid stale cluster state
peersJSONPath := filepath.Join(dataDir, "raft", "peers.json")
if _, err := os.Stat(peersJSONPath); err == nil {
instance.logger.Debug("Removing existing peers.json before joining cluster",
zap.String("path", peersJSONPath))
_ = os.Remove(peersJSONPath)
}
// Prepare join addresses (strip http:// prefix if present)
joinAddrs := make([]string, 0, len(cfg.JoinAddresses))
for _, addr := range cfg.JoinAddresses {
addr = strings.TrimPrefix(addr, "http://")
addr = strings.TrimPrefix(addr, "https://")
joinAddrs = append(joinAddrs, addr)
}
// Wait for join targets to be available
if err := is.waitForJoinTargets(ctx, cfg.JoinAddresses); err != nil {
instance.logger.Warn("Join targets not all reachable, will still attempt join",
zap.Error(err))
}
args = append(args,
"-join", strings.Join(joinAddrs, ","),
"-join-as", cfg.RaftAdvAddress,
"-join-attempts", "30",
"-join-interval", "10s",
)
}
// Add data directory as final argument
args = append(args, dataDir)
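// For a joining node with the example addresses used elsewhere in this file,
// the resulting command is along the lines of (illustrative only):
//
//	rqlited -http-addr 0.0.0.0:10000 -http-adv-addr 192.168.1.11:10000 \
//	    -raft-addr 0.0.0.0:10001 -raft-adv-addr 192.168.1.11:10001 \
//	    -join 192.168.1.10:10000 -join-as 192.168.1.11:10001 \
//	    -join-attempts 30 -join-interval 10s <dataDir>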
instance.logger.Info("Starting RQLite instance",
zap.Int("http_port", cfg.HTTPPort),
zap.Int("raft_port", cfg.RaftPort),
zap.Strings("join_addresses", cfg.JoinAddresses),
zap.Bool("is_leader", cfg.IsLeader),
)
// Create command
cmd := exec.CommandContext(ctx, "rqlited", args...)
instance.cmd = cmd
// Setup logging
logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", cfg.NodeID))
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return nil, &InstanceError{
Message: "failed to open log file",
Cause: err,
}
}
cmd.Stdout = logFile
cmd.Stderr = logFile
// Start the process
if err := cmd.Start(); err != nil {
logFile.Close()
return nil, &InstanceError{
Message: "failed to start RQLite process",
Cause: err,
}
}
logFile.Close()
instance.PID = cmd.Process.Pid
instance.StartedAt = time.Now()
// Store instance
is.mu.Lock()
is.instances[key] = instance
is.mu.Unlock()
// Wait for instance to be ready
if err := is.waitForInstanceReady(ctx, instance); err != nil {
// Kill the process on failure
if cmd.Process != nil {
_ = cmd.Process.Kill()
}
is.mu.Lock()
delete(is.instances, key)
is.mu.Unlock()
return nil, &InstanceError{
Message: "RQLite instance did not become ready",
Cause: err,
}
}
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
instance.logger.Info("RQLite instance started successfully",
zap.Int("pid", instance.PID),
)
// Start background process monitor
go is.monitorInstance(instance)
return instance, nil
}
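// On disk, a spawn for namespace "alice" on node "node-1" leaves roughly
// (illustrative; the paths mirror the Join calls above):
//
//	<baseDir>/alice/rqlite/node-1/          data directory handed to rqlited
//	<baseDir>/alice/logs/rqlite-node-1.log  combined stdout/stderr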
// StopInstance stops an RQLite instance for a namespace on a specific node
func (is *InstanceSpawner) StopInstance(ctx context.Context, namespace, nodeID string) error {
key := instanceKey(namespace, nodeID)
is.mu.Lock()
instance, ok := is.instances[key]
if !ok {
is.mu.Unlock()
return nil // Already stopped
}
delete(is.instances, key)
is.mu.Unlock()
if instance.cmd != nil && instance.cmd.Process != nil {
instance.logger.Info("Stopping RQLite instance", zap.Int("pid", instance.PID))
// Send SIGTERM for graceful shutdown
if err := instance.cmd.Process.Signal(os.Interrupt); err != nil {
// If SIGTERM fails, kill it
_ = instance.cmd.Process.Kill()
}
// Wait for process to exit with timeout
done := make(chan error, 1)
go func() {
done <- instance.cmd.Wait()
}()
select {
case <-done:
instance.logger.Info("RQLite instance stopped gracefully")
case <-time.After(10 * time.Second):
instance.logger.Warn("RQLite instance did not stop gracefully, killing")
_ = instance.cmd.Process.Kill()
case <-ctx.Done():
_ = instance.cmd.Process.Kill()
return ctx.Err()
}
}
instance.Status = InstanceStatusStopped
return nil
}
// StopAllInstances stops all RQLite instances for a namespace
func (is *InstanceSpawner) StopAllInstances(ctx context.Context, ns string) error {
is.mu.RLock()
var keys []string
for key, inst := range is.instances {
if inst.Namespace == ns {
keys = append(keys, key)
}
}
is.mu.RUnlock()
var lastErr error
for _, key := range keys {
parts := strings.SplitN(key, ":", 2)
if len(parts) == 2 {
if err := is.StopInstance(ctx, parts[0], parts[1]); err != nil {
lastErr = err
}
}
}
return lastErr
}
// GetInstance returns the instance for a namespace on a specific node
func (is *InstanceSpawner) GetInstance(namespace, nodeID string) (*RQLiteInstance, bool) {
is.mu.RLock()
defer is.mu.RUnlock()
instance, ok := is.instances[instanceKey(namespace, nodeID)]
return instance, ok
}
// GetNamespaceInstances returns all instances for a namespace
func (is *InstanceSpawner) GetNamespaceInstances(ns string) []*RQLiteInstance {
is.mu.RLock()
defer is.mu.RUnlock()
var instances []*RQLiteInstance
for _, inst := range is.instances {
if inst.Namespace == ns {
instances = append(instances, inst)
}
}
return instances
}
// HealthCheck checks if an instance is healthy
func (is *InstanceSpawner) HealthCheck(ctx context.Context, namespace, nodeID string) (bool, error) {
instance, ok := is.GetInstance(namespace, nodeID)
if !ok {
return false, &InstanceError{Message: "instance not found"}
}
healthy, err := instance.IsHealthy(ctx)
if healthy {
is.mu.Lock()
instance.LastHealthCheck = time.Now()
is.mu.Unlock()
}
return healthy, err
}
// waitForJoinTargets waits for join target nodes to be reachable
func (is *InstanceSpawner) waitForJoinTargets(ctx context.Context, joinAddresses []string) error {
timeout := 2 * time.Minute
deadline := time.Now().Add(timeout)
client := tlsutil.NewHTTPClient(5 * time.Second)
for time.Now().Before(deadline) {
allReachable := true
for _, addr := range joinAddresses {
statusURL := addr
if !strings.HasPrefix(addr, "http") {
statusURL = "http://" + addr
}
statusURL = strings.TrimRight(statusURL, "/") + "/status"
resp, err := client.Get(statusURL)
if err != nil {
allReachable = false
break
}
resp.Body.Close()
if resp.StatusCode != http.StatusOK {
allReachable = false
break
}
}
if allReachable {
return nil
}
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(2 * time.Second):
}
}
return fmt.Errorf("join targets not reachable within timeout")
}
// waitForInstanceReady waits for the RQLite instance to be ready
func (is *InstanceSpawner) waitForInstanceReady(ctx context.Context, instance *RQLiteInstance) error {
url := fmt.Sprintf("http://localhost:%d/status", instance.HTTPPort)
client := tlsutil.NewHTTPClient(2 * time.Second)
// Longer timeout for joining nodes as they need to sync
maxAttempts := 180 // 3 minutes
if len(instance.JoinAddresses) > 0 {
maxAttempts = 300 // 5 minutes for joiners
}
for i := 0; i < maxAttempts; i++ {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
}
resp, err := client.Get(url)
if err != nil {
continue
}
body, readErr := io.ReadAll(resp.Body)
resp.Body.Close()
if resp.StatusCode != http.StatusOK || readErr != nil {
continue
}
var statusResp map[string]interface{}
if err := json.Unmarshal(body, &statusResp); err != nil {
continue
}
raft, ok := statusResp["raft"].(map[string]interface{})
if !ok {
// Backwards compatibility - if no raft status, consider the instance ready
return nil
}
state, _ := raft["state"].(string)
if state == "leader" || state == "follower" {
instance.logger.Debug("RQLite instance ready",
zap.String("state", state),
zap.Int("attempts", i+1),
)
return nil
}
}
return fmt.Errorf("RQLite did not become ready within timeout")
}
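// A trimmed sketch of the rqlite /status payload consulted here and in
// IsHealthy/GetLeaderAddress (only raft.state and raft.leader_addr are relied
// upon; the remaining fields vary by rqlite version and are omitted):
//
//	{
//	  "raft": {
//	    "state": "follower",
//	    "leader_addr": "192.168.1.10:10001"
//	  }
//	}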
// monitorInstance monitors an instance and updates its status
func (is *InstanceSpawner) monitorInstance(instance *RQLiteInstance) {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for range ticker.C {
is.mu.RLock()
key := instanceKey(instance.Namespace, instance.NodeID)
_, exists := is.instances[key]
is.mu.RUnlock()
if !exists {
// Instance was removed
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
healthy, _ := instance.IsHealthy(ctx)
cancel()
is.mu.Lock()
if healthy {
instance.Status = InstanceStatusRunning
instance.LastHealthCheck = time.Now()
} else {
instance.Status = InstanceStatusFailed
instance.logger.Warn("RQLite instance health check failed")
}
is.mu.Unlock()
// Check if process is still running
if instance.cmd != nil && instance.cmd.ProcessState != nil && instance.cmd.ProcessState.Exited() {
is.mu.Lock()
instance.Status = InstanceStatusStopped
is.mu.Unlock()
instance.logger.Warn("RQLite instance process exited unexpectedly")
return
}
}
}
// IsHealthy checks if the RQLite instance is healthy
func (ri *RQLiteInstance) IsHealthy(ctx context.Context) (bool, error) {
url := fmt.Sprintf("http://localhost:%d/status", ri.HTTPPort)
client := tlsutil.NewHTTPClient(5 * time.Second)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return false, err
}
resp, err := client.Do(req)
if err != nil {
return false, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return false, fmt.Errorf("status endpoint returned %d", resp.StatusCode)
}
body, err := io.ReadAll(resp.Body)
if err != nil {
return false, err
}
var statusResp map[string]interface{}
if err := json.Unmarshal(body, &statusResp); err != nil {
return false, err
}
if raft, ok := statusResp["raft"].(map[string]interface{}); ok {
state, _ := raft["state"].(string)
return state == "leader" || state == "follower", nil
}
// Backwards compatibility
return true, nil
}
// GetLeaderAddress returns the leader's address for the cluster
func (ri *RQLiteInstance) GetLeaderAddress(ctx context.Context) (string, error) {
url := fmt.Sprintf("http://localhost:%d/status", ri.HTTPPort)
client := tlsutil.NewHTTPClient(5 * time.Second)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return "", err
}
resp, err := client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", err
}
var statusResp map[string]interface{}
if err := json.Unmarshal(body, &statusResp); err != nil {
return "", err
}
if raft, ok := statusResp["raft"].(map[string]interface{}); ok {
if leader, ok := raft["leader_addr"].(string); ok {
return leader, nil
}
}
return "", fmt.Errorf("leader address not found in status response")
}
// DSN returns the connection string for this RQLite instance
func (ri *RQLiteInstance) DSN() string {
return fmt.Sprintf("http://localhost:%d", ri.HTTPPort)
}
// AdvertisedDSN returns the advertised connection string for cluster communication
func (ri *RQLiteInstance) AdvertisedDSN() string {
return fmt.Sprintf("http://%s", ri.HTTPAdvAddress)
}
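// For an instance with HTTPPort 10000 and HTTPAdvAddress "192.168.1.10:10000"
// (example values only), these helpers return:
//
//	DSN()           -> "http://localhost:10000"
//	AdvertisedDSN() -> "http://192.168.1.10:10000"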