From 7b12dde469144b2d70c28ece1f4e544aa14e2224 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Thu, 29 Jan 2026 13:07:05 +0200 Subject: [PATCH] Fixed dns failover middleware --- pkg/gateway/handlers/deployments/service.go | 67 +++++++++++++++------ pkg/gateway/middleware.go | 21 ++++++- 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/pkg/gateway/handlers/deployments/service.go b/pkg/gateway/handlers/deployments/service.go index 6c24147..9c071fe 100644 --- a/pkg/gateway/handlers/deployments/service.go +++ b/pkg/gateway/handlers/deployments/service.go @@ -366,6 +366,22 @@ func (s *DeploymentService) setupDynamicReplica(ctx context.Context, deployment zap.String("deployment_id", deployment.ID), zap.String("node_id", nodeID), ) + + // Create DNS record for the replica node (after successful setup) + dnsName := deployment.Subdomain + if dnsName == "" { + dnsName = deployment.Name + } + fqdn := fmt.Sprintf("%s.%s.", dnsName, s.BaseDomain()) + if err := s.createDNSRecord(ctx, fqdn, "A", nodeIP, deployment.Namespace, deployment.ID); err != nil { + s.logger.Error("Failed to create DNS record for replica", zap.String("node_id", nodeID), zap.Error(err)) + } else { + s.logger.Info("Created DNS record for replica", + zap.String("fqdn", fqdn), + zap.String("ip", nodeIP), + zap.String("node_id", nodeID), + ) + } } // callInternalAPI makes an HTTP POST to a node's internal API. @@ -559,33 +575,44 @@ func (s *DeploymentService) UpdateDeploymentStatus(ctx context.Context, deployme return nil } -// CreateDNSRecords creates DNS records for a deployment +// CreateDNSRecords creates DNS records for a deployment. +// Creates A records for the home node and all replica nodes for round-robin DNS. func (s *DeploymentService) CreateDNSRecords(ctx context.Context, deployment *deployments.Deployment) error { - // Get node IP using the full node ID - nodeIP, err := s.getNodeIP(ctx, deployment.HomeNodeID) - if err != nil { - s.logger.Error("Failed to get node IP", zap.Error(err)) - return err - } - // Use subdomain if set, otherwise fall back to name - // New format: {name}-{random}.{baseDomain} (e.g., myapp-f3o4if.dbrs.space) dnsName := deployment.Subdomain if dnsName == "" { dnsName = deployment.Name } - - // Create deployment record: {subdomain}.{baseDomain} - // Any node can receive the request and proxy to the home node if needed fqdn := fmt.Sprintf("%s.%s.", dnsName, s.BaseDomain()) - if err := s.createDNSRecord(ctx, fqdn, "A", nodeIP, deployment.Namespace, deployment.ID); err != nil { - s.logger.Error("Failed to create DNS record", zap.Error(err)) - } else { - s.logger.Info("Created DNS record", - zap.String("fqdn", fqdn), - zap.String("ip", nodeIP), - zap.String("subdomain", dnsName), - ) + + // Collect all node IDs that should have DNS records (home node + replicas) + nodeIDs := []string{deployment.HomeNodeID} + if s.replicaManager != nil { + replicaNodes, err := s.replicaManager.GetActiveReplicaNodes(ctx, deployment.ID) + if err == nil { + for _, nodeID := range replicaNodes { + if nodeID != deployment.HomeNodeID { + nodeIDs = append(nodeIDs, nodeID) + } + } + } + } + + for _, nodeID := range nodeIDs { + nodeIP, err := s.getNodeIP(ctx, nodeID) + if err != nil { + s.logger.Error("Failed to get node IP for DNS record", zap.String("node_id", nodeID), zap.Error(err)) + continue + } + if err := s.createDNSRecord(ctx, fqdn, "A", nodeIP, deployment.Namespace, deployment.ID); err != nil { + s.logger.Error("Failed to create DNS record", zap.String("node_id", nodeID), zap.Error(err)) + } else { + s.logger.Info("Created DNS record", + zap.String("fqdn", fqdn), + zap.String("ip", nodeIP), + zap.String("node_id", nodeID), + ) + } } return nil diff --git a/pkg/gateway/middleware.go b/pkg/gateway/middleware.go index 10d6e3b..3d3b8ba 100644 --- a/pkg/gateway/middleware.go +++ b/pkg/gateway/middleware.go @@ -874,10 +874,18 @@ serveLocal: httpClient := &http.Client{Timeout: 30 * time.Second} resp, err := httpClient.Do(proxyReq) if err != nil { - g.logger.ComponentError(logging.ComponentGeneral, "proxy request failed", + g.logger.ComponentError(logging.ComponentGeneral, "local proxy request failed", zap.String("target", target), zap.Error(err), ) + + // Local process is down — try other replica nodes before giving up + if g.replicaManager != nil { + if g.proxyCrossNodeWithReplicas(w, r, deployment) { + return + } + } + http.Error(w, "Service unavailable", http.StatusServiceUnavailable) return } @@ -1049,7 +1057,7 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep proxyReq.Header.Set("X-Forwarded-For", getClientIP(r)) proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) - httpClient := &http.Client{Timeout: 120 * time.Second} + httpClient := &http.Client{Timeout: 5 * time.Second} resp, err := httpClient.Do(proxyReq) if err != nil { g.logger.Warn("Replica proxy request failed", @@ -1060,6 +1068,15 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep } defer resp.Body.Close() + // If the remote node returned a gateway error, try the next replica + if resp.StatusCode == http.StatusBadGateway || resp.StatusCode == http.StatusServiceUnavailable || resp.StatusCode == http.StatusGatewayTimeout { + g.logger.Warn("Replica returned gateway error, trying next", + zap.String("target_ip", nodeIP), + zap.Int("status", resp.StatusCode), + ) + return false + } + for key, values := range resp.Header { for _, value := range values { w.Header().Add(key, value)