package node import ( "context" "database/sql" "fmt" "net" "os" "path/filepath" "strings" "time" "github.com/DeBrosOfficial/network/pkg/logging" "go.uber.org/zap" ) // registerDNSNode registers this node in the dns_nodes table for deployment routing func (n *Node) registerDNSNode(ctx context.Context) error { if n.rqliteAdapter == nil { return fmt.Errorf("rqlite adapter not initialized") } // Get node ID (use peer ID) nodeID := n.GetPeerID() if nodeID == "" { return fmt.Errorf("node peer ID not available") } // Get external IP address ipAddress, err := n.getNodeIPAddress() if err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to determine node IP, using localhost", zap.Error(err)) ipAddress = "127.0.0.1" } // Get internal IP from WireGuard interface (for cross-node communication over VPN) internalIP := ipAddress if wgIP, err := n.getWireGuardIP(); err == nil && wgIP != "" { internalIP = wgIP } // Determine region (defaulting to "local" for now, could be from cloud metadata in future) region := "local" // Insert or update node record query := ` INSERT INTO dns_nodes (id, ip_address, internal_ip, region, status, last_seen, created_at, updated_at) VALUES (?, ?, ?, ?, 'active', datetime('now'), datetime('now'), datetime('now')) ON CONFLICT(id) DO UPDATE SET ip_address = excluded.ip_address, internal_ip = excluded.internal_ip, region = excluded.region, status = 'active', last_seen = datetime('now'), updated_at = datetime('now') ` db := n.rqliteAdapter.GetSQLDB() _, err = db.ExecContext(ctx, query, nodeID, ipAddress, internalIP, region) if err != nil { return fmt.Errorf("failed to register DNS node: %w", err) } n.logger.ComponentInfo(logging.ComponentNode, "Registered DNS node", zap.String("node_id", nodeID), zap.String("ip_address", ipAddress), zap.String("region", region), ) return nil } // startDNSHeartbeat starts a goroutine that periodically updates the node's last_seen timestamp func (n *Node) startDNSHeartbeat(ctx context.Context) { go func() { ticker := time.NewTicker(30 * time.Second) defer ticker.Stop() for { select { case <-ctx.Done(): n.logger.ComponentInfo(logging.ComponentNode, "DNS heartbeat stopped") return case <-ticker.C: if err := n.updateDNSHeartbeat(ctx); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to update DNS heartbeat", zap.Error(err)) } // Self-healing: ensure this node's DNS records exist on every heartbeat if err := n.ensureBaseDNSRecords(ctx); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS records on heartbeat", zap.Error(err)) } // Remove DNS records for nodes that stopped heartbeating n.cleanupStaleNodeRecords(ctx) } } }() n.logger.ComponentInfo(logging.ComponentNode, "Started DNS heartbeat (30s interval)") } // updateDNSHeartbeat updates the node's last_seen timestamp in dns_nodes func (n *Node) updateDNSHeartbeat(ctx context.Context) error { if n.rqliteAdapter == nil { return fmt.Errorf("rqlite adapter not initialized") } nodeID := n.GetPeerID() if nodeID == "" { return fmt.Errorf("node peer ID not available") } query := `UPDATE dns_nodes SET last_seen = datetime('now'), updated_at = datetime('now') WHERE id = ?` db := n.rqliteAdapter.GetSQLDB() _, err := db.ExecContext(ctx, query, nodeID) if err != nil { return fmt.Errorf("failed to update DNS heartbeat: %w", err) } return nil } // ensureBaseDNSRecords ensures this node's IP is present in the base DNS records. // This provides self-healing: if records are missing (fresh install, DB reset), // the node recreates them on startup. Each node only manages its own IP entries. // // Records are created for BOTH the base domain (dbrs.space) and the node domain // (node1.dbrs.space). The base domain records enable round-robin load balancing // across all nodes. The node domain records enable direct node access. func (n *Node) ensureBaseDNSRecords(ctx context.Context) error { baseDomain := n.config.HTTPGateway.BaseDomain nodeDomain := n.config.Node.Domain if baseDomain == "" && nodeDomain == "" { return nil // No domain configured, skip } ipAddress, err := n.getNodeIPAddress() if err != nil { return fmt.Errorf("failed to determine node IP: %w", err) } db := n.rqliteAdapter.GetSQLDB() // Clean up any private IP A records left by old code versions. // Old code could insert WireGuard IPs (10.0.0.x) into dns_records. // This self-heals on every heartbeat cycle. cleanupPrivateIPRecords(ctx, db, n.logger) // Build list of A records to ensure var records []struct { fqdn string value string } // Base domain records (e.g., dbrs.space, *.dbrs.space) — only for nameserver nodes. // Only nameserver nodes run Caddy (HTTPS), so only they should appear in base domain // round-robin. Non-nameserver nodes would cause TLS failures for clients. if baseDomain != "" && n.isNameserverNode(ctx) { records = append(records, struct{ fqdn, value string }{baseDomain + ".", ipAddress}, struct{ fqdn, value string }{"*." + baseDomain + ".", ipAddress}, ) } // Node-specific records (e.g., node1.dbrs.space, *.node1.dbrs.space) — for direct node access if nodeDomain != "" && nodeDomain != baseDomain { records = append(records, struct{ fqdn, value string }{nodeDomain + ".", ipAddress}, struct{ fqdn, value string }{"*." + nodeDomain + ".", ipAddress}, ) } // Insert root A record and wildcard A record for this node's IP // ON CONFLICT DO NOTHING avoids duplicates (UNIQUE on fqdn, record_type, value) for _, r := range records { query := `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING` if _, err := db.ExecContext(ctx, query, r.fqdn, r.value); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS record", zap.String("fqdn", r.fqdn), zap.Error(err)) } } // Ensure SOA and NS records exist for the base domain (self-healing) if baseDomain != "" { n.ensureSOAAndNSRecords(ctx, baseDomain) } // Claim an NS slot for the base domain (ns1/ns2/ns3) — only if this node // was installed with --nameserver (i.e. runs Caddy + CoreDNS). if baseDomain != "" && n.isNameserverPreference() { n.claimNameserverSlot(ctx, baseDomain, ipAddress) } return nil } // ensureSOAAndNSRecords creates SOA and NS records for the base domain if they don't exist. // These are normally seeded during install Phase 7, but if that fails (e.g. migrations // not yet run), the heartbeat self-heals them here. func (n *Node) ensureSOAAndNSRecords(ctx context.Context, baseDomain string) { db := n.rqliteAdapter.GetSQLDB() fqdn := baseDomain + "." // Check if SOA exists var count int err := db.QueryRowContext(ctx, `SELECT COUNT(*) FROM dns_records WHERE fqdn = ? AND record_type = 'SOA'`, fqdn, ).Scan(&count) if err != nil || count > 0 { return // SOA exists or query failed, skip } n.logger.ComponentInfo(logging.ComponentNode, "SOA/NS records missing, self-healing", zap.String("domain", baseDomain)) // Create SOA record soaValue := fmt.Sprintf("ns1.%s. admin.%s. %d 3600 1800 604800 300", baseDomain, baseDomain, time.Now().Unix()) if _, err := db.ExecContext(ctx, `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES (?, 'SOA', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`, fqdn, soaValue, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to create SOA record", zap.Error(err)) } // Create NS records (ns1, ns2, ns3) for i := 1; i <= 3; i++ { nsValue := fmt.Sprintf("ns%d.%s.", i, baseDomain) if _, err := db.ExecContext(ctx, `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES (?, 'NS', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`, fqdn, nsValue, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to create NS record", zap.Error(err)) } } } // claimNameserverSlot attempts to claim an available NS hostname (ns1/ns2/ns3) for this node. // If the node already has a slot, it updates the IP. If no slot is available, it does nothing. func (n *Node) claimNameserverSlot(ctx context.Context, domain, ipAddress string) { nodeID := n.GetPeerID() db := n.rqliteAdapter.GetSQLDB() // Check if this node already has a slot var existingHostname string err := db.QueryRowContext(ctx, `SELECT hostname FROM dns_nameservers WHERE node_id = ? AND domain = ?`, nodeID, domain, ).Scan(&existingHostname) if err == nil { // Already claimed — update IP if changed if _, err := db.ExecContext(ctx, `UPDATE dns_nameservers SET ip_address = ?, updated_at = datetime('now') WHERE hostname = ? AND domain = ?`, ipAddress, existingHostname, domain, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to update NS slot IP", zap.Error(err)) } // Ensure the glue A record matches nsFQDN := existingHostname + "." + domain + "." if _, err := db.ExecContext(ctx, `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`, nsFQDN, ipAddress, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure NS glue record", zap.Error(err)) } return } // Try to claim an available slot for _, hostname := range []string{"ns1", "ns2", "ns3"} { result, err := db.ExecContext(ctx, `INSERT INTO dns_nameservers (hostname, node_id, ip_address, domain) VALUES (?, ?, ?, ?) ON CONFLICT(hostname) DO NOTHING`, hostname, nodeID, ipAddress, domain, ) if err != nil { continue } rows, _ := result.RowsAffected() if rows > 0 { // Successfully claimed this slot — create glue record nsFQDN := hostname + "." + domain + "." if _, err := db.ExecContext(ctx, `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`, nsFQDN, ipAddress, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to create NS glue record", zap.Error(err)) } n.logger.ComponentInfo(logging.ComponentNode, "Claimed NS slot", zap.String("hostname", hostname), zap.String("ip", ipAddress), ) return } } } // cleanupStaleNodeRecords removes A records for nodes that have stopped heartbeating. // This ensures DNS only returns IPs for healthy, active nodes. func (n *Node) cleanupStaleNodeRecords(ctx context.Context) { if n.rqliteAdapter == nil { return } baseDomain := n.config.HTTPGateway.BaseDomain if baseDomain == "" { baseDomain = n.config.Node.Domain } if baseDomain == "" { return } db := n.rqliteAdapter.GetSQLDB() // Find nodes that haven't sent a heartbeat in over 2 minutes staleQuery := `SELECT id, ip_address FROM dns_nodes WHERE status = 'active' AND last_seen < datetime('now', '-120 seconds')` rows, err := db.QueryContext(ctx, staleQuery) if err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to query stale nodes", zap.Error(err)) return } defer rows.Close() // Build all FQDNs to clean: base domain + node domain var fqdnsToClean []string fqdnsToClean = append(fqdnsToClean, baseDomain+".", "*."+baseDomain+".") if n.config.Node.Domain != "" && n.config.Node.Domain != baseDomain { fqdnsToClean = append(fqdnsToClean, n.config.Node.Domain+".", "*."+n.config.Node.Domain+".") } for rows.Next() { var nodeID, ip string if err := rows.Scan(&nodeID, &ip); err != nil { continue } // Mark node as inactive if _, err := db.ExecContext(ctx, `UPDATE dns_nodes SET status = 'inactive', updated_at = datetime('now') WHERE id = ?`, nodeID); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to mark node inactive", zap.String("node_id", nodeID), zap.Error(err)) } // Remove the dead node's A records from round-robin for _, f := range fqdnsToClean { if _, err := db.ExecContext(ctx, `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`, f, ip); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove stale DNS record", zap.String("fqdn", f), zap.String("ip", ip), zap.Error(err)) } } // Release any NS slot held by this dead node if _, err := db.ExecContext(ctx, `DELETE FROM dns_nameservers WHERE node_id = ?`, nodeID); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to release NS slot", zap.String("node_id", nodeID), zap.Error(err)) } // Remove glue records for this node's IP (ns1.domain., ns2.domain., ns3.domain.) for _, ns := range []string{"ns1", "ns2", "ns3"} { nsFQDN := ns + "." + baseDomain + "." if _, err := db.ExecContext(ctx, `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`, nsFQDN, ip, ); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove NS glue record", zap.Error(err)) } } n.logger.ComponentInfo(logging.ComponentNode, "Removed stale node from DNS", zap.String("node_id", nodeID), zap.String("ip", ip), ) } } // isNameserverPreference checks if this node was installed with --nameserver flag // by reading the preferences.yaml file. Only nameserver nodes should claim NS slots. func (n *Node) isNameserverPreference() bool { oramaDir := filepath.Join(os.ExpandEnv(n.config.Node.DataDir), "..") prefsPath := filepath.Join(oramaDir, "preferences.yaml") data, err := os.ReadFile(prefsPath) if err != nil { return false } // Simple check: look for "nameserver: true" in the YAML return strings.Contains(string(data), "nameserver: true") } // isNameserverNode checks if this node has claimed a nameserver slot (ns1/ns2/ns3). // Only nameserver nodes run Caddy for HTTPS, so only they should be in base domain DNS. func (n *Node) isNameserverNode(ctx context.Context) bool { if n.rqliteAdapter == nil { return false } nodeID := n.GetPeerID() if nodeID == "" { return false } db := n.rqliteAdapter.GetSQLDB() var count int err := db.QueryRowContext(ctx, `SELECT COUNT(*) FROM dns_nameservers WHERE node_id = ?`, nodeID, ).Scan(&count) return err == nil && count > 0 } // getWireGuardIP returns the IPv4 address assigned to the wg0 interface, if any func (n *Node) getWireGuardIP() (string, error) { iface, err := net.InterfaceByName("wg0") if err != nil { return "", err } addrs, err := iface.Addrs() if err != nil { return "", err } for _, addr := range addrs { if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil { return ipnet.IP.String(), nil } } return "", fmt.Errorf("no IPv4 address on wg0") } // getNodeIPAddress attempts to determine the node's external IP address func (n *Node) getNodeIPAddress() (string, error) { // Try to detect external IP by connecting to a public server conn, err := net.Dial("udp", "8.8.8.8:80") if err != nil { // If that fails, try to get first non-loopback interface IP addrs, err := net.InterfaceAddrs() if err != nil { return "", err } for _, addr := range addrs { if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && !ipnet.IP.IsPrivate() { if ipnet.IP.To4() != nil { return ipnet.IP.String(), nil } } } return "", fmt.Errorf("no suitable IP address found") } defer conn.Close() localAddr := conn.LocalAddr().(*net.UDPAddr) if localAddr.IP.IsPrivate() || localAddr.IP.IsLoopback() { // UDP dial returned a private/loopback IP (e.g. WireGuard 10.0.0.x). // Fall back to scanning interfaces for a public IPv4. addrs, err := net.InterfaceAddrs() if err != nil { return "", fmt.Errorf("private IP detected (%s) and failed to list interfaces: %w", localAddr.IP, err) } for _, addr := range addrs { if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && !ipnet.IP.IsPrivate() { if ipnet.IP.To4() != nil { return ipnet.IP.String(), nil } } } return "", fmt.Errorf("private IP detected (%s) and no public IPv4 found on interfaces", localAddr.IP) } return localAddr.IP.String(), nil } // cleanupPrivateIPRecords deletes any A records with private/loopback IPs from dns_records. // Old code versions could insert WireGuard IPs (10.0.0.x) into the table. This runs on // every heartbeat to self-heal. func cleanupPrivateIPRecords(ctx context.Context, db *sql.DB, logger *logging.ColoredLogger) { query := `DELETE FROM dns_records WHERE record_type = 'A' AND namespace = 'system' AND (value LIKE '10.%' OR value LIKE '172.16.%' OR value LIKE '172.17.%' OR value LIKE '172.18.%' OR value LIKE '172.19.%' OR value LIKE '172.2_.%' OR value LIKE '172.30.%' OR value LIKE '172.31.%' OR value LIKE '192.168.%' OR value = '127.0.0.1')` result, err := db.ExecContext(ctx, query) if err != nil { logger.ComponentWarn(logging.ComponentNode, "Failed to clean up private IP DNS records", zap.Error(err)) return } if rows, _ := result.RowsAffected(); rows > 0 { logger.ComponentInfo(logging.ComponentNode, "Cleaned up private IP DNS records", zap.Int64("deleted", rows)) } }