From 5ec292a4f2df4fef03ba90d0d1c8f6375bb3597d Mon Sep 17 00:00:00 2001
From: anonpenguin23
Date: Thu, 29 Jan 2026 07:22:32 +0200
Subject: [PATCH] fixed bugs on dns for deployment

---
 migrations/011_dns_nameservers.sql            |  19 ++
 pkg/cli/production/install/orchestrator.go    |  13 ++
 .../production/installers/coredns.go          | 159 ++++---------
 pkg/node/dns_registration.go                  | 219 ++++++++++++++++++
 pkg/node/node.go                              |   5 +
 5 files changed, 297 insertions(+), 118 deletions(-)
 create mode 100644 migrations/011_dns_nameservers.sql

diff --git a/migrations/011_dns_nameservers.sql b/migrations/011_dns_nameservers.sql
new file mode 100644
index 0000000..e2655c0
--- /dev/null
+++ b/migrations/011_dns_nameservers.sql
@@ -0,0 +1,19 @@
+-- Migration 011: DNS Nameservers Table
+-- Maps NS hostnames (ns1, ns2, ns3) to specific node IDs and IPs
+-- Provides stable NS assignment that survives restarts and re-seeding
+
+BEGIN;
+
+CREATE TABLE IF NOT EXISTS dns_nameservers (
+    hostname TEXT PRIMARY KEY, -- e.g., "ns1", "ns2", "ns3"
+    node_id TEXT NOT NULL, -- Peer ID of the assigned node
+    ip_address TEXT NOT NULL, -- IP address of the assigned node
+    domain TEXT NOT NULL, -- Base domain (e.g., "dbrs.space")
+    assigned_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(node_id, domain) -- A node can only hold one NS slot per domain
+);
+
+INSERT OR IGNORE INTO schema_migrations(version) VALUES (11);
+
+COMMIT;
diff --git a/pkg/cli/production/install/orchestrator.go b/pkg/cli/production/install/orchestrator.go
index 0ccbcff..f0d1132 100644
--- a/pkg/cli/production/install/orchestrator.go
+++ b/pkg/cli/production/install/orchestrator.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"time"
 
 	"github.com/DeBrosOfficial/network/pkg/cli/utils"
 	"github.com/DeBrosOfficial/network/pkg/environments/production"
@@ -167,6 +168,18 @@ func (o *Orchestrator) Execute() error {
 		return fmt.Errorf("service creation failed: %w", err)
 	}
 
+	// Seed DNS records after services are running (RQLite must be up)
+	if o.flags.Nameserver && o.flags.BaseDomain != "" {
+		fmt.Printf("\n🌐 Phase 6: Seeding DNS records...\n")
+		fmt.Printf(" Waiting for RQLite to start (10s)...\n")
+		time.Sleep(10 * time.Second)
+		if err := o.setup.SeedDNSRecords(o.flags.BaseDomain, o.flags.VpsIP, o.peers); err != nil {
+			fmt.Fprintf(os.Stderr, " ⚠️ Warning: Failed to seed DNS records: %v\n", err)
+		} else {
+			fmt.Printf(" ✓ DNS records seeded\n")
+		}
+	}
+
 	// Log completion with actual peer ID
 	o.setup.LogSetupComplete(o.setup.NodePeerID)
 	fmt.Printf("āœ… Production installation complete!\n\n")
diff --git a/pkg/environments/production/installers/coredns.go b/pkg/environments/production/installers/coredns.go
index 10116d2..e203d68 100644
--- a/pkg/environments/production/installers/coredns.go
+++ b/pkg/environments/production/installers/coredns.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"net/http"
-	"net/url"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -313,137 +312,61 @@ func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string {
 `, domain, domain, rqliteDSN)
 }
 
-// seedStaticRecords inserts static zone records into RQLite
+// seedStaticRecords inserts static zone records into RQLite (non-destructive)
+// Each node only adds its own IP to the round-robin. SOA and NS records are upserted idempotently.
 func (ci *CoreDNSInstaller) seedStaticRecords(domain, rqliteDSN, ns1IP, ns2IP, ns3IP string) error {
-	// First, check if nameserver A records already exist with different IPs
-	// If so, we should preserve them instead of overwriting with potentially wrong IPs
-	existingNSIPs, err := ci.getExistingNameserverIPs(domain, rqliteDSN)
-	if err == nil && len(existingNSIPs) == 3 {
-		// Check if they have at least 2 different IPs (properly configured cluster)
-		uniqueIPs := make(map[string]bool)
-		for _, ip := range existingNSIPs {
-			uniqueIPs[ip] = true
-		}
-		if len(uniqueIPs) >= 2 {
-			// Nameserver records are already properly configured, use existing IPs
-			fmt.Fprintf(ci.logWriter, " Using existing nameserver IPs from database\n")
-			ns1IP = existingNSIPs[0]
-			ns2IP = existingNSIPs[1]
-			ns3IP = existingNSIPs[2]
-		}
-	}
-
 	// Generate serial based on current date
 	serial := fmt.Sprintf("%d", time.Now().Unix())
 
 	// SOA record format: "mname rname serial refresh retry expire minimum"
 	soaValue := fmt.Sprintf("ns1.%s. admin.%s. %s 3600 1800 604800 300", domain, domain, serial)
 
-	// First, delete existing system records to avoid duplicates
-	// We only delete system records, not deployment-created records
-	deleteStatements := []string{
-		fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '%s.' AND record_type IN ('SOA', 'NS', 'A')`, domain),
-		fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '*.%s.' AND record_type = 'A'`, domain),
-		fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn LIKE 'ns%%.%s.' AND record_type = 'A'`, domain),
-	}
-
-	if err := ci.executeRQLiteStatements(rqliteDSN, deleteStatements); err != nil {
-		return fmt.Errorf("failed to clean up old records: %w", err)
-	}
-
-	// Define all static records
-	records := []struct {
-		fqdn       string
-		recordType string
-		value      string
-		ttl        int
-	}{
-		// SOA record
-		{domain + ".", "SOA", soaValue, 300},
-
-		// NS records
-		{domain + ".", "NS", "ns1." + domain + ".", 300},
-		{domain + ".", "NS", "ns2." + domain + ".", 300},
-		{domain + ".", "NS", "ns3." + domain + ".", 300},
-
-		// Nameserver A records (glue)
-		{"ns1." + domain + ".", "A", ns1IP, 300},
-		{"ns2." + domain + ".", "A", ns2IP, 300},
-		{"ns3." + domain + ".", "A", ns3IP, 300},
-
-		// Root domain A records (round-robin)
-		{domain + ".", "A", ns1IP, 300},
-		{domain + ".", "A", ns2IP, 300},
-		{domain + ".", "A", ns3IP, 300},
-
-		// Wildcard A records (round-robin)
-		{"*." + domain + ".", "A", ns1IP, 300},
-		{"*." + domain + ".", "A", ns2IP, 300},
-		{"*." + domain + ".", "A", ns3IP, 300},
-	}
-
-	// Build SQL statements
 	var statements []string
-	for _, r := range records {
-		// IMPORTANT: Must set is_active = TRUE for CoreDNS to find the records
-		stmt := fmt.Sprintf(
-			`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s', '%s', '%s', %d, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
-			r.fqdn, r.recordType, r.value, r.ttl,
-		)
-		statements = append(statements, stmt)
+
+	// SOA record — delete old and insert new (serial changes each time, so value differs)
+	statements = append(statements, fmt.Sprintf(
+		`DELETE FROM dns_records WHERE fqdn = '%s.' AND record_type = 'SOA' AND namespace = 'system'`,
+		domain,
+	))
+	statements = append(statements, fmt.Sprintf(
+		`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'SOA', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
+		domain, soaValue,
+	))
+
+	// NS records — idempotent insert
+	for i := 1; i <= 3; i++ {
+		statements = append(statements, fmt.Sprintf(
+			`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'NS', 'ns%d.%s.', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
+			domain, i, domain,
+		))
+	}
+
+	// NOTE: Nameserver glue A records (ns1/ns2/ns3) are NOT seeded here.
+	// They are managed by each node's claimNameserverSlot() on the heartbeat loop,
+	// which correctly maps each NS hostname to exactly one node's IP.
+
+	// Round-robin A records — each unique, non-empty IP is added once (no duplicates due to UNIQUE constraint)
+	uniqueIPs := make(map[string]bool)
+	for _, ip := range []string{ns1IP, ns2IP, ns3IP} {
+		if ip != "" && !uniqueIPs[ip] {
+			uniqueIPs[ip] = true
+			// Root domain A record
+			statements = append(statements, fmt.Sprintf(
+				`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
+				domain, ip,
+			))
+			// Wildcard A record
+			statements = append(statements, fmt.Sprintf(
+				`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('*.%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
+				domain, ip,
+			))
+		}
 	}
 
 	// Execute via RQLite HTTP API
 	return ci.executeRQLiteStatements(rqliteDSN, statements)
 }
 
-// getExistingNameserverIPs queries RQLite for existing ns1, ns2, ns3 A record IPs
-func (ci *CoreDNSInstaller) getExistingNameserverIPs(domain, rqliteDSN string) ([]string, error) {
-	// Build query - use url.QueryEscape to properly encode the SQL
-	query := fmt.Sprintf("SELECT fqdn, value FROM dns_records WHERE fqdn LIKE 'ns_.%s.' AND record_type = 'A' AND is_active = TRUE ORDER BY fqdn", domain)
-	queryURL := fmt.Sprintf("%s/db/query?q=%s", rqliteDSN, url.QueryEscape(query))
-
-	client := &http.Client{Timeout: 5 * time.Second}
-	resp, err := client.Get(queryURL)
-	if err != nil {
-		return nil, err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return nil, fmt.Errorf("query failed with status %d", resp.StatusCode)
-	}
-
-	var result struct {
-		Results []struct {
-			Values [][]interface{} `json:"values"`
-		} `json:"results"`
-	}
-
-	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
-		return nil, err
-	}
-
-	if len(result.Results) == 0 || result.Results[0].Values == nil || len(result.Results[0].Values) < 3 {
-		return nil, fmt.Errorf("not enough nameserver records found")
-	}
-
-	// Extract IPs for ns1, ns2, ns3 (ordered by fqdn)
-	ips := make([]string, 0, 3)
-	for _, row := range result.Results[0].Values {
-		if len(row) >= 2 {
-			if ip, ok := row[1].(string); ok {
-				ips = append(ips, ip)
-			}
-		}
-	}
-
-	if len(ips) != 3 {
-		return nil, fmt.Errorf("expected 3 nameserver IPs, got %d", len(ips))
-	}
-
-	return ips, nil
-}
 
 // rqliteResult represents the response from RQLite execute endpoint
 type rqliteResult struct {
diff --git a/pkg/node/dns_registration.go b/pkg/node/dns_registration.go
index c3645e1..28555d8 100644
--- a/pkg/node/dns_registration.go
+++ b/pkg/node/dns_registration.go
@@ -78,6 +78,12 @@ func (n *Node) startDNSHeartbeat(ctx context.Context) {
 				if err := n.updateDNSHeartbeat(ctx); err != nil {
 					n.logger.ComponentWarn(logging.ComponentNode, "Failed to update DNS heartbeat", zap.Error(err))
 				}
+				// Self-healing: ensure this node's DNS records exist on every heartbeat
+				if err := n.ensureBaseDNSRecords(ctx); err != nil {
+					n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS records on heartbeat", zap.Error(err))
+				}
+				// Remove DNS records for nodes that stopped heartbeating
+				n.cleanupStaleNodeRecords(ctx)
 			}
 		}
 	}()
@@ -106,6 +112,219 @@ func (n *Node) updateDNSHeartbeat(ctx context.Context) error {
 	return nil
 }
 
+// ensureBaseDNSRecords ensures this node's IP is present in the base DNS records.
+// This provides self-healing: if records are missing (fresh install, DB reset),
+// the node recreates them on startup and on every heartbeat. Each node only
+// manages its own IP entries. Failed inserts are logged, not returned; the only
+// error returned is a failure to determine the node's IP address.
+func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
+	if n.rqliteAdapter == nil {
+		return nil // No database handle, skip (mirrors cleanupStaleNodeRecords)
+	}
+
+	domain := n.config.Node.Domain
+	if domain == "" {
+		domain = n.config.HTTPGateway.BaseDomain
+	}
+	if domain == "" {
+		return nil // No domain configured, skip
+	}
+
+	ipAddress, err := n.getNodeIPAddress()
+	if err != nil {
+		return fmt.Errorf("failed to determine node IP: %w", err)
+	}
+
+	// Ensure trailing dot for FQDN format (as CoreDNS expects)
+	fqdn := domain + "."
+	wildcardFQDN := "*." + domain + "."
+
+	db := n.rqliteAdapter.GetSQLDB()
+
+	// Insert root A record and wildcard A record for this node's IP
+	// ON CONFLICT DO NOTHING avoids duplicates (UNIQUE on fqdn, record_type, value)
+	records := []struct {
+		fqdn  string
+		value string
+	}{
+		{fqdn, ipAddress},
+		{wildcardFQDN, ipAddress},
+	}
+
+	for _, r := range records {
+		query := `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
+			VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
+			ON CONFLICT(fqdn, record_type, value) DO NOTHING`
+		if _, err := db.ExecContext(ctx, query, r.fqdn, r.value); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS record",
+				zap.String("fqdn", r.fqdn), zap.Error(err))
+		}
+	}
+
+	// Claim an NS slot if available (ns1, ns2, or ns3)
+	n.claimNameserverSlot(ctx, domain, ipAddress)
+
+	return nil
+}
+
+// claimNameserverSlot attempts to claim an available NS hostname (ns1/ns2/ns3) for this node.
+// If the node already has a slot, it updates the IP and replaces the glue A record so the
+// hostname resolves only to the current IP. If no slot is available, it does nothing.
+func (n *Node) claimNameserverSlot(ctx context.Context, domain, ipAddress string) {
+	nodeID := n.GetPeerID()
+	db := n.rqliteAdapter.GetSQLDB()
+
+	// Check if this node already has a slot
+	var existingHostname string
+	err := db.QueryRowContext(ctx,
+		`SELECT hostname FROM dns_nameservers WHERE node_id = ? AND domain = ?`,
+		nodeID, domain,
+	).Scan(&existingHostname)
+
+	if err == nil {
+		// Already claimed — update IP if changed
+		if _, err := db.ExecContext(ctx,
+			`UPDATE dns_nameservers SET ip_address = ?, updated_at = datetime('now') WHERE hostname = ? AND domain = ?`,
+			ipAddress, existingHostname, domain,
+		); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to update NS slot IP", zap.Error(err))
+		}
+		nsFQDN := existingHostname + "." + domain + "."
+		// Drop glue records left over from a previous IP of this slot; otherwise
+		// the NS hostname would keep resolving to the old address as well.
+		if _, err := db.ExecContext(ctx,
+			`DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value != ? AND namespace = 'system'`,
+			nsFQDN, ipAddress,
+		); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove outdated NS glue record", zap.Error(err))
+		}
+		// Ensure the glue A record matches
+		if _, err := db.ExecContext(ctx,
+			`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
+			VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
+			ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
+			nsFQDN, ipAddress,
+		); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure NS glue record", zap.Error(err))
+		}
+		return
+	}
+
+	// Try to claim an available slot. A failed lookup above also lands here;
+	// UNIQUE(node_id, domain) keeps a node from taking a second slot.
+	for _, hostname := range []string{"ns1", "ns2", "ns3"} {
+		result, err := db.ExecContext(ctx,
+			`INSERT INTO dns_nameservers (hostname, node_id, ip_address, domain) VALUES (?, ?, ?, ?)
+			ON CONFLICT(hostname) DO NOTHING`,
+			hostname, nodeID, ipAddress, domain,
+		)
+		if err != nil {
+			continue
+		}
+		rows, _ := result.RowsAffected()
+		if rows > 0 {
+			// Successfully claimed this slot — create glue record
+			nsFQDN := hostname + "." + domain + "."
+			if _, err := db.ExecContext(ctx,
+				`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
+				VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
+				ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
+				nsFQDN, ipAddress,
+			); err != nil {
+				n.logger.ComponentWarn(logging.ComponentNode, "Failed to create NS glue record", zap.Error(err))
+			}
+			n.logger.ComponentInfo(logging.ComponentNode, "Claimed NS slot",
+				zap.String("hostname", hostname),
+				zap.String("ip", ipAddress),
+			)
+			return
+		}
+	}
+}
+
+// cleanupStaleNodeRecords removes A records for nodes that have stopped heartbeating.
+// This ensures DNS only returns IPs for healthy, active nodes. The stale set is read
+// in full before any write is issued, so no result cursor stays open during the updates.
+func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
+	if n.rqliteAdapter == nil {
+		return
+	}
+
+	domain := n.config.Node.Domain
+	if domain == "" {
+		domain = n.config.HTTPGateway.BaseDomain
+	}
+	if domain == "" {
+		return
+	}
+
+	db := n.rqliteAdapter.GetSQLDB()
+
+	// Find nodes that haven't sent a heartbeat in over 2 minutes
+	staleQuery := `SELECT id, ip_address FROM dns_nodes WHERE status = 'active' AND last_seen < datetime('now', '-120 seconds')`
+	rows, err := db.QueryContext(ctx, staleQuery)
+	if err != nil {
+		n.logger.ComponentWarn(logging.ComponentNode, "Failed to query stale nodes", zap.Error(err))
+		return
+	}
+	defer rows.Close()
+
+	type staleNode struct{ id, ip string }
+	var staleNodes []staleNode
+	for rows.Next() {
+		var sn staleNode
+		if err := rows.Scan(&sn.id, &sn.ip); err != nil {
+			continue
+		}
+		staleNodes = append(staleNodes, sn)
+	}
+	if err := rows.Err(); err != nil {
+		n.logger.ComponentWarn(logging.ComponentNode, "Failed to read stale nodes", zap.Error(err))
+	}
+	rows.Close() // Release the cursor before issuing the writes below
+
+	fqdn := domain + "."
+	wildcardFQDN := "*." + domain + "."
+
+	for _, sn := range staleNodes {
+		nodeID, ip := sn.id, sn.ip
+
+		// Mark node as inactive
+		if _, err := db.ExecContext(ctx, `UPDATE dns_nodes SET status = 'inactive', updated_at = datetime('now') WHERE id = ?`, nodeID); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to mark node inactive", zap.String("node_id", nodeID), zap.Error(err))
+		}
+
+		// Remove the dead node's A records from round-robin
+		for _, f := range []string{fqdn, wildcardFQDN} {
+			if _, err := db.ExecContext(ctx, `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`, f, ip); err != nil {
+				n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove stale DNS record",
+					zap.String("fqdn", f), zap.String("ip", ip), zap.Error(err))
+			}
+		}
+
+		// Release any NS slot held by this dead node
+		if _, err := db.ExecContext(ctx, `DELETE FROM dns_nameservers WHERE node_id = ?`, nodeID); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to release NS slot", zap.String("node_id", nodeID), zap.Error(err))
+		}
+
+		// Remove glue records for this node's IP (ns1.domain., ns2.domain., ns3.domain.)
+		for _, ns := range []string{"ns1", "ns2", "ns3"} {
+			nsFQDN := ns + "." + domain + "."
+			if _, err := db.ExecContext(ctx,
+				`DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`,
+				nsFQDN, ip,
+			); err != nil {
+				n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove NS glue record", zap.Error(err))
+			}
+		}
+
+		n.logger.ComponentInfo(logging.ComponentNode, "Removed stale node from DNS",
+			zap.String("node_id", nodeID),
+			zap.String("ip", ip),
+		)
+	}
+}
+
 // getNodeIPAddress attempts to determine the node's external IP address
 func (n *Node) getNodeIPAddress() (string, error) {
 	// Try to detect external IP by connecting to a public server
diff --git a/pkg/node/node.go b/pkg/node/node.go
index 62d069d..3daf968 100644
--- a/pkg/node/node.go
+++ b/pkg/node/node.go
@@ -110,6 +110,11 @@ func (n *Node) Start(ctx context.Context) error {
 	} else {
 		// Start DNS heartbeat to keep node status fresh
 		n.startDNSHeartbeat(ctx)
+
+		// Ensure base DNS records exist for this node (self-healing)
+		if err := n.ensureBaseDNSRecords(ctx); err != nil {
+			n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure base DNS records", zap.Error(err))
+		}
 	}
 
 	// Get listen addresses for logging