// network/pkg/namespace/node_selector.go (Go, 386 lines, 11 KiB)

package namespace
import (
"context"
"sort"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
// ClusterNodeSelector selects optimal nodes for namespace clusters.
// It extends the existing capacity scoring system from deployments/home_node.go
// to select multiple nodes based on available capacity.
//
// Instances are created with NewClusterNodeSelector, which also tags the
// logger with the selector's component name.
type ClusterNodeSelector struct {
// db is the client every node, deployment, port and resource query runs through.
db rqlite.Client
// portAllocator reports how many namespace allocations a node already holds.
portAllocator *NamespacePortAllocator
// logger receives the selector's debug, info and warning entries.
logger *zap.Logger
}
// NodeCapacity represents the capacity metrics for a single node.
// Values are computed by getNodeCapacity from database counts and fixed
// per-node limits; every "available" figure is floored at zero.
type NodeCapacity struct {
// NodeID is the node's identifier (the id column of dns_nodes).
NodeID string `json:"node_id"`
// IPAddress is the node's ip_address value from dns_nodes.
IPAddress string `json:"ip_address"`
InternalIP string `json:"internal_ip"` // WireGuard IP for inter-node communication
// DeploymentCount is the number of deployments homed on the node whose
// status is 'active' or 'deploying'.
DeploymentCount int `json:"deployment_count"`
// AllocatedPorts is the number of port_allocations rows for the node.
AllocatedPorts int `json:"allocated_ports"`
// AvailablePorts is the port limit (9900) minus AllocatedPorts.
AvailablePorts int `json:"available_ports"`
// UsedMemoryMB is the summed total_memory_mb of the node's home_node_assignments.
UsedMemoryMB int `json:"used_memory_mb"`
// AvailableMemoryMB is the memory limit (8192 MB) minus UsedMemoryMB.
AvailableMemoryMB int `json:"available_memory_mb"`
// UsedCPUPercent is the summed total_cpu_percent of the node's home_node_assignments.
UsedCPUPercent int `json:"used_cpu_percent"`
NamespaceInstanceCount int `json:"namespace_instance_count"` // Number of namespace clusters on this node
AvailableNamespaceSlots int `json:"available_namespace_slots"` // How many more namespace instances can fit
// Score is the weighted capacity score from calculateCapacityScore; higher
// means more free capacity.
Score float64 `json:"score"`
}
// NewClusterNodeSelector builds a ClusterNodeSelector around the given
// database client and namespace port allocator. The supplied logger is
// decorated with a "component" field so that every entry written by the
// selector can be attributed to it.
func NewClusterNodeSelector(db rqlite.Client, portAllocator *NamespacePortAllocator, logger *zap.Logger) *ClusterNodeSelector {
	selector := new(ClusterNodeSelector)
	selector.db = db
	selector.portAllocator = portAllocator
	selector.logger = logger.With(zap.String("component", "cluster-node-selector"))
	return selector
}
// SelectNodesForCluster selects the optimal N nodes for a new namespace cluster.
//
// It loads the currently active nodes, computes a NodeCapacity for each one,
// drops nodes that have no free namespace slot, and returns the nodeCount
// best candidates ordered by score (best first). Nodes with equal scores keep
// the order in which getActiveNodes returned them, so the result is
// deterministic for a given database state.
//
// Nodes whose capacity cannot be determined are logged and skipped rather
// than failing the whole selection.
//
// Errors:
//   - a *ClusterError when nodeCount is negative;
//   - whatever getActiveNodes returns when the node query fails;
//   - a *ClusterError carrying ErrInsufficientNodes.Message when fewer than
//     nodeCount nodes are eligible.
func (cns *ClusterNodeSelector) SelectNodesForCluster(ctx context.Context, nodeCount int) ([]NodeCapacity, error) {
	// A negative count would slip past the "enough nodes" check below and
	// then panic when used as a slice bound, so reject it up front.
	if nodeCount < 0 {
		return nil, &ClusterError{
			Message: "node count must not be negative",
			Cause:   nil,
		}
	}

	internalCtx := client.WithInternalAuth(ctx)

	// Get all active nodes
	activeNodes, err := cns.getActiveNodes(internalCtx)
	if err != nil {
		return nil, err
	}

	cns.logger.Debug("Found active nodes", zap.Int("count", len(activeNodes)))

	// Filter nodes that have capacity for namespace instances. At most every
	// active node is eligible, so size the slice once.
	eligibleNodes := make([]NodeCapacity, 0, len(activeNodes))
	for _, node := range activeNodes {
		capacity, err := cns.getNodeCapacity(internalCtx, node.NodeID, node.IPAddress, node.InternalIP)
		if err != nil {
			cns.logger.Warn("Failed to get node capacity, skipping",
				zap.String("node_id", node.NodeID),
				zap.Error(err),
			)
			continue
		}

		// Only include nodes with available namespace slots
		if capacity.AvailableNamespaceSlots <= 0 {
			cns.logger.Debug("Node at capacity, skipping",
				zap.String("node_id", node.NodeID),
				zap.Int("namespace_instances", capacity.NamespaceInstanceCount),
			)
			continue
		}
		eligibleNodes = append(eligibleNodes, *capacity)
	}

	cns.logger.Debug("Eligible nodes after filtering", zap.Int("count", len(eligibleNodes)))

	// Check if we have enough nodes
	if len(eligibleNodes) < nodeCount {
		return nil, &ClusterError{
			Message: ErrInsufficientNodes.Message,
			Cause:   nil,
		}
	}

	// Sort by score (highest first). The stable variant keeps equally scored
	// nodes in their original order instead of an arbitrary one.
	sort.SliceStable(eligibleNodes, func(i, j int) bool {
		return eligibleNodes[i].Score > eligibleNodes[j].Score
	})

	// Return top N nodes. The capacity is capped so that a caller appending
	// to the result cannot write into the unselected candidates.
	selectedNodes := eligibleNodes[:nodeCount:nodeCount]

	cns.logger.Info("Selected nodes for cluster",
		zap.Int("requested", nodeCount),
		zap.Int("selected", len(selectedNodes)),
	)

	for i, node := range selectedNodes {
		cns.logger.Debug("Selected node",
			zap.Int("rank", i+1),
			zap.String("node_id", node.NodeID),
			zap.Float64("score", node.Score),
			zap.Int("namespace_instances", node.NamespaceInstanceCount),
			zap.Int("available_slots", node.AvailableNamespaceSlots),
		)
	}

	return selectedNodes, nil
}
// nodeInfo is used for querying active nodes.
// Each field is mapped by its db tag to a column selected from dns_nodes.
type nodeInfo struct {
// NodeID holds the id column.
NodeID string `db:"id"`
// IPAddress holds the ip_address column.
IPAddress string `db:"ip_address"`
// InternalIP holds the internal_ip column; the queries in this file fall
// back to ip_address when internal_ip is NULL.
InternalIP string `db:"internal_ip"`
}
// getActiveNodes loads every node from dns_nodes whose status is 'active'
// and whose last_seen is newer than two minutes ago, ordered by id.
// A query failure is returned wrapped in a *ClusterError.
//
// NOTE(review): the cutoff is rendered from time.Now(), i.e. in the process's
// local time zone — confirm that last_seen is stored in the same zone.
func (cns *ClusterNodeSelector) getActiveNodes(ctx context.Context) ([]nodeInfo, error) {
	const (
		// A node counts as alive only if it checked in within this window.
		freshnessWindow = 2 * time.Minute
		timestampLayout = "2006-01-02 15:04:05"
	)

	oldestAccepted := time.Now().Add(-freshnessWindow).Format(timestampLayout)

	var nodes []nodeInfo
	queryErr := cns.db.Query(ctx, &nodes, `
SELECT id, ip_address, COALESCE(internal_ip, ip_address) as internal_ip FROM dns_nodes
WHERE status = 'active' AND last_seen > ?
ORDER BY id
`, oldestAccepted)
	if queryErr != nil {
		return nil, &ClusterError{Message: "failed to query active nodes", Cause: queryErr}
	}

	cns.logger.Debug("Found active nodes", zap.Int("count", len(nodes)))
	return nodes, nil
}
// getNodeCapacity gathers the usage figures for one node and turns them into
// a NodeCapacity. It reads, in order, the deployment count, the allocated
// port count, the summed memory/CPU usage and the namespace allocation
// count; the first failing lookup aborts the call and its error is returned
// unchanged. Remaining capacity is measured against fixed per-node limits
// and never reported as negative.
func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipAddress, internalIP string) (*NodeCapacity, error) {
	// Fixed per-node limits used for the capacity calculation.
	const (
		deploymentLimit = 100
		portLimit       = 9900 // User deployment port range
		memoryLimitMB   = 8192 // 8GB
		cpuLimitPercent = 400  // 4 cores
	)

	// remaining reports how much of limit is still unused, floored at zero.
	remaining := func(limit, used int) int {
		if left := limit - used; left > 0 {
			return left
		}
		return 0
	}

	deployments, err := cns.getDeploymentCount(ctx, nodeID)
	if err != nil {
		return nil, err
	}

	ports, err := cns.getDeploymentPortCount(ctx, nodeID)
	if err != nil {
		return nil, err
	}

	// Resource usage comes from home_node_assignments.
	memoryMB, cpuPercent, err := cns.getNodeResourceUsage(ctx, nodeID)
	if err != nil {
		return nil, err
	}

	namespaces, err := cns.portAllocator.GetNodeAllocationCount(ctx, nodeID)
	if err != nil {
		return nil, err
	}

	// Capacity score (higher is better); extends the home_node.go scoring
	// with the namespace instance count.
	score := cns.calculateCapacityScore(
		deployments, deploymentLimit,
		ports, portLimit,
		memoryMB, memoryLimitMB,
		cpuPercent, cpuLimitPercent,
		namespaces, MaxNamespacesPerNode,
	)

	return &NodeCapacity{
		NodeID:                  nodeID,
		IPAddress:               ipAddress,
		InternalIP:              internalIP,
		DeploymentCount:         deployments,
		AllocatedPorts:          ports,
		AvailablePorts:          remaining(portLimit, ports),
		UsedMemoryMB:            memoryMB,
		AvailableMemoryMB:       remaining(memoryLimitMB, memoryMB),
		UsedCPUPercent:          cpuPercent,
		NamespaceInstanceCount:  namespaces,
		AvailableNamespaceSlots: remaining(MaxNamespacesPerNode, namespaces),
		Score:                   score,
	}, nil
}
// getDeploymentCount returns how many deployments are homed on nodeID with a
// status of 'active' or 'deploying'. A query failure is returned wrapped in a
// *ClusterError; an empty result set is reported as zero.
func (cns *ClusterNodeSelector) getDeploymentCount(ctx context.Context, nodeID string) (int, error) {
	type deploymentTally struct {
		Count int `db:"count"`
	}

	var rows []deploymentTally
	if err := cns.db.Query(ctx, &rows,
		`SELECT COUNT(*) as count FROM deployments WHERE home_node_id = ? AND status IN ('active', 'deploying')`,
		nodeID,
	); err != nil {
		return 0, &ClusterError{Message: "failed to count deployments", Cause: err}
	}

	if len(rows) > 0 {
		return rows[0].Count, nil
	}
	return 0, nil
}
// getDeploymentPortCount returns the number of port_allocations rows that
// belong to nodeID. A query failure is returned wrapped in a *ClusterError;
// an empty result set is reported as zero.
func (cns *ClusterNodeSelector) getDeploymentPortCount(ctx context.Context, nodeID string) (int, error) {
	type portTally struct {
		Count int `db:"count"`
	}

	const portCountSQL = `SELECT COUNT(*) as count FROM port_allocations WHERE node_id = ?`

	var rows []portTally
	queryErr := cns.db.Query(ctx, &rows, portCountSQL, nodeID)
	switch {
	case queryErr != nil:
		return 0, &ClusterError{Message: "failed to count allocated ports", Cause: queryErr}
	case len(rows) == 0:
		return 0, nil
	default:
		return rows[0].Count, nil
	}
}
// getNodeResourceUsage adds up total_memory_mb and total_cpu_percent over all
// home_node_assignments rows for nodeID and returns them as
// (memoryMB, cpuPercent). The query substitutes 0 for a NULL sum. A query
// failure is returned wrapped in a *ClusterError; an empty result set is
// reported as (0, 0).
func (cns *ClusterNodeSelector) getNodeResourceUsage(ctx context.Context, nodeID string) (int, int, error) {
	type usageTotals struct {
		TotalMemoryMB   int `db:"total_memory"`
		TotalCPUPercent int `db:"total_cpu"`
	}

	var rows []usageTotals
	if err := cns.db.Query(ctx, &rows, `
SELECT
COALESCE(SUM(total_memory_mb), 0) as total_memory,
COALESCE(SUM(total_cpu_percent), 0) as total_cpu
FROM home_node_assignments
WHERE home_node_id = ?
`, nodeID); err != nil {
		return 0, 0, &ClusterError{Message: "failed to query resource usage", Cause: err}
	}

	if len(rows) > 0 {
		totals := rows[0]
		return totals.TotalMemoryMB, totals.TotalCPUPercent, nil
	}
	return 0, 0, nil
}
// calculateCapacityScore calculates a weighted capacity score (0.0 to 1.0).
// Higher scores indicate more available capacity.
//
// Each (used, max) pair is converted to the unused fraction of that resource,
// clamped to the range [0, 1]. A non-positive maximum is treated as "no
// capacity" (component score 0) so that the division can never produce
// Inf or NaN, which would otherwise corrupt score-based sorting.
//
// The components are combined with these weights:
// deployments 30%, ports 15%, memory 15%, cpu 15%, namespace instances 25%.
// The inputs and every component score are logged at debug level.
func (cns *ClusterNodeSelector) calculateCapacityScore(
	deploymentCount, maxDeployments int,
	allocatedPorts, maxPorts int,
	usedMemoryMB, maxMemoryMB int,
	usedCPUPercent, maxCPUPercent int,
	namespaceInstances, maxNamespaceInstances int,
) float64 {
	// freeFraction returns the unused share of a resource, clamped to [0, 1].
	freeFraction := func(used, limit int) float64 {
		if limit <= 0 {
			// No usable limit: report the resource as exhausted instead of
			// dividing by zero.
			return 0
		}
		free := 1.0 - (float64(used) / float64(limit))
		switch {
		case free < 0:
			return 0
		case free > 1:
			// Only reachable with negative usage; keep the documented range.
			return 1
		default:
			return free
		}
	}

	// Calculate individual component scores (0.0 to 1.0)
	deploymentScore := freeFraction(deploymentCount, maxDeployments)
	portScore := freeFraction(allocatedPorts, maxPorts)
	memoryScore := freeFraction(usedMemoryMB, maxMemoryMB)
	cpuScore := freeFraction(usedCPUPercent, maxCPUPercent)
	namespaceScore := freeFraction(namespaceInstances, maxNamespaceInstances)

	// Weighted average
	// Namespace instance count gets significant weight since that's what we're optimizing for
	// Weights: deployments 30%, ports 15%, memory 15%, cpu 15%, namespace instances 25%
	totalScore := (deploymentScore * 0.30) +
		(portScore * 0.15) +
		(memoryScore * 0.15) +
		(cpuScore * 0.15) +
		(namespaceScore * 0.25)

	cns.logger.Debug("Calculated capacity score",
		zap.Int("deployments", deploymentCount),
		zap.Int("allocated_ports", allocatedPorts),
		zap.Int("used_memory_mb", usedMemoryMB),
		zap.Int("used_cpu_percent", usedCPUPercent),
		zap.Int("namespace_instances", namespaceInstances),
		zap.Float64("deployment_score", deploymentScore),
		zap.Float64("port_score", portScore),
		zap.Float64("memory_score", memoryScore),
		zap.Float64("cpu_score", cpuScore),
		zap.Float64("namespace_score", namespaceScore),
		zap.Float64("total_score", totalScore),
	)

	return totalScore
}
// GetNodeByID looks up a single dns_nodes row by its id, using an
// internally authenticated context. The internal IP falls back to the
// node's ip_address when internal_ip is NULL.
//
// It returns (nil, nil) when no node with that id exists, and a
// *ClusterError when the query itself fails.
func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) {
	const lookupSQL = `SELECT id, ip_address, COALESCE(internal_ip, ip_address) as internal_ip FROM dns_nodes WHERE id = ? LIMIT 1`

	var matches []nodeInfo
	if err := cns.db.Query(client.WithInternalAuth(ctx), &matches, lookupSQL, nodeID); err != nil {
		return nil, &ClusterError{Message: "failed to query node", Cause: err}
	}

	if len(matches) > 0 {
		return &matches[0], nil
	}
	return nil, nil
}