mirror of
https://github.com/DeBrosOfficial/network.git
synced 2026-01-30 06:53:03 +00:00
fixed bugs on dns for deployment
This commit is contained in:
parent
d4f5f3b999
commit
5ec292a4f2
19
migrations/011_dns_nameservers.sql
Normal file
19
migrations/011_dns_nameservers.sql
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
-- Migration 011: dns_nameservers table.
--
-- Records which node currently owns each nameserver hostname (ns1, ns2, ns3)
-- together with that node's IP, so the NS assignment stays stable across
-- restarts and re-seeding.

BEGIN;

CREATE TABLE IF NOT EXISTS dns_nameservers (
    -- NS label, e.g. "ns1", "ns2", "ns3"
    hostname    TEXT PRIMARY KEY,
    -- Peer ID of the node holding this slot
    node_id     TEXT NOT NULL,
    -- IP address of the node holding this slot
    ip_address  TEXT NOT NULL,
    -- Base domain the slot belongs to, e.g. "dbrs.space"
    domain      TEXT NOT NULL,
    assigned_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- A node may hold at most one NS slot per domain
    UNIQUE (node_id, domain)
);

-- Record that migration 11 has been applied (no-op if already recorded)
INSERT OR IGNORE INTO schema_migrations (version) VALUES (11);

COMMIT;
|
||||||
@ -6,6 +6,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/DeBrosOfficial/network/pkg/cli/utils"
|
"github.com/DeBrosOfficial/network/pkg/cli/utils"
|
||||||
"github.com/DeBrosOfficial/network/pkg/environments/production"
|
"github.com/DeBrosOfficial/network/pkg/environments/production"
|
||||||
@ -167,6 +168,18 @@ func (o *Orchestrator) Execute() error {
|
|||||||
return fmt.Errorf("service creation failed: %w", err)
|
return fmt.Errorf("service creation failed: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Seed DNS records after services are running (RQLite must be up)
|
||||||
|
if o.flags.Nameserver && o.flags.BaseDomain != "" {
|
||||||
|
fmt.Printf("\n🌐 Phase 6: Seeding DNS records...\n")
|
||||||
|
fmt.Printf(" Waiting for RQLite to start (10s)...\n")
|
||||||
|
time.Sleep(10 * time.Second)
|
||||||
|
if err := o.setup.SeedDNSRecords(o.flags.BaseDomain, o.flags.VpsIP, o.peers); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, " ⚠️ Warning: Failed to seed DNS records: %v\n", err)
|
||||||
|
} else {
|
||||||
|
fmt.Printf(" ✓ DNS records seeded\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Log completion with actual peer ID
|
// Log completion with actual peer ID
|
||||||
o.setup.LogSetupComplete(o.setup.NodePeerID)
|
o.setup.LogSetupComplete(o.setup.NodePeerID)
|
||||||
fmt.Printf("✅ Production installation complete!\n\n")
|
fmt.Printf("✅ Production installation complete!\n\n")
|
||||||
|
|||||||
@ -6,7 +6,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@ -313,137 +312,61 @@ func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string {
|
|||||||
`, domain, domain, rqliteDSN)
|
`, domain, domain, rqliteDSN)
|
||||||
}
|
}
|
||||||
|
|
||||||
// seedStaticRecords inserts static zone records into RQLite
|
// seedStaticRecords inserts static zone records into RQLite (non-destructive)
|
||||||
|
// Each node only adds its own IP to the round-robin. SOA and NS records are upserted idempotently.
|
||||||
func (ci *CoreDNSInstaller) seedStaticRecords(domain, rqliteDSN, ns1IP, ns2IP, ns3IP string) error {
|
func (ci *CoreDNSInstaller) seedStaticRecords(domain, rqliteDSN, ns1IP, ns2IP, ns3IP string) error {
|
||||||
// First, check if nameserver A records already exist with different IPs
|
|
||||||
// If so, we should preserve them instead of overwriting with potentially wrong IPs
|
|
||||||
existingNSIPs, err := ci.getExistingNameserverIPs(domain, rqliteDSN)
|
|
||||||
if err == nil && len(existingNSIPs) == 3 {
|
|
||||||
// Check if they have at least 2 different IPs (properly configured cluster)
|
|
||||||
uniqueIPs := make(map[string]bool)
|
|
||||||
for _, ip := range existingNSIPs {
|
|
||||||
uniqueIPs[ip] = true
|
|
||||||
}
|
|
||||||
if len(uniqueIPs) >= 2 {
|
|
||||||
// Nameserver records are already properly configured, use existing IPs
|
|
||||||
fmt.Fprintf(ci.logWriter, " Using existing nameserver IPs from database\n")
|
|
||||||
ns1IP = existingNSIPs[0]
|
|
||||||
ns2IP = existingNSIPs[1]
|
|
||||||
ns3IP = existingNSIPs[2]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate serial based on current date
|
// Generate serial based on current date
|
||||||
serial := fmt.Sprintf("%d", time.Now().Unix())
|
serial := fmt.Sprintf("%d", time.Now().Unix())
|
||||||
|
|
||||||
// SOA record format: "mname rname serial refresh retry expire minimum"
|
// SOA record format: "mname rname serial refresh retry expire minimum"
|
||||||
soaValue := fmt.Sprintf("ns1.%s. admin.%s. %s 3600 1800 604800 300", domain, domain, serial)
|
soaValue := fmt.Sprintf("ns1.%s. admin.%s. %s 3600 1800 604800 300", domain, domain, serial)
|
||||||
|
|
||||||
// First, delete existing system records to avoid duplicates
|
|
||||||
// We only delete system records, not deployment-created records
|
|
||||||
deleteStatements := []string{
|
|
||||||
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '%s.' AND record_type IN ('SOA', 'NS', 'A')`, domain),
|
|
||||||
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '*.%s.' AND record_type = 'A'`, domain),
|
|
||||||
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn LIKE 'ns%%.%s.' AND record_type = 'A'`, domain),
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := ci.executeRQLiteStatements(rqliteDSN, deleteStatements); err != nil {
|
|
||||||
return fmt.Errorf("failed to clean up old records: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Define all static records
|
|
||||||
records := []struct {
|
|
||||||
fqdn string
|
|
||||||
recordType string
|
|
||||||
value string
|
|
||||||
ttl int
|
|
||||||
}{
|
|
||||||
// SOA record
|
|
||||||
{domain + ".", "SOA", soaValue, 300},
|
|
||||||
|
|
||||||
// NS records
|
|
||||||
{domain + ".", "NS", "ns1." + domain + ".", 300},
|
|
||||||
{domain + ".", "NS", "ns2." + domain + ".", 300},
|
|
||||||
{domain + ".", "NS", "ns3." + domain + ".", 300},
|
|
||||||
|
|
||||||
// Nameserver A records (glue)
|
|
||||||
{"ns1." + domain + ".", "A", ns1IP, 300},
|
|
||||||
{"ns2." + domain + ".", "A", ns2IP, 300},
|
|
||||||
{"ns3." + domain + ".", "A", ns3IP, 300},
|
|
||||||
|
|
||||||
// Root domain A records (round-robin)
|
|
||||||
{domain + ".", "A", ns1IP, 300},
|
|
||||||
{domain + ".", "A", ns2IP, 300},
|
|
||||||
{domain + ".", "A", ns3IP, 300},
|
|
||||||
|
|
||||||
// Wildcard A records (round-robin)
|
|
||||||
{"*." + domain + ".", "A", ns1IP, 300},
|
|
||||||
{"*." + domain + ".", "A", ns2IP, 300},
|
|
||||||
{"*." + domain + ".", "A", ns3IP, 300},
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build SQL statements
|
|
||||||
var statements []string
|
var statements []string
|
||||||
for _, r := range records {
|
|
||||||
// IMPORTANT: Must set is_active = TRUE for CoreDNS to find the records
|
// SOA record — delete old and insert new (serial changes each time, so value differs)
|
||||||
stmt := fmt.Sprintf(
|
statements = append(statements, fmt.Sprintf(
|
||||||
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s', '%s', '%s', %d, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
|
`DELETE FROM dns_records WHERE fqdn = '%s.' AND record_type = 'SOA' AND namespace = 'system'`,
|
||||||
r.fqdn, r.recordType, r.value, r.ttl,
|
domain,
|
||||||
)
|
))
|
||||||
statements = append(statements, stmt)
|
statements = append(statements, fmt.Sprintf(
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'SOA', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
|
||||||
|
domain, soaValue,
|
||||||
|
))
|
||||||
|
|
||||||
|
// NS records — idempotent insert
|
||||||
|
for i := 1; i <= 3; i++ {
|
||||||
|
statements = append(statements, fmt.Sprintf(
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'NS', 'ns%d.%s.', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||||
|
domain, i, domain,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: Nameserver glue A records (ns1/ns2/ns3) are NOT seeded here.
|
||||||
|
// They are managed by each node's claimNameserverSlot() on the heartbeat loop,
|
||||||
|
// which correctly maps each NS hostname to exactly one node's IP.
|
||||||
|
|
||||||
|
// Round-robin A records — each unique IP is added once (no duplicates due to UNIQUE constraint)
|
||||||
|
uniqueIPs := make(map[string]bool)
|
||||||
|
for _, ip := range []string{ns1IP, ns2IP, ns3IP} {
|
||||||
|
if !uniqueIPs[ip] {
|
||||||
|
uniqueIPs[ip] = true
|
||||||
|
// Root domain A record
|
||||||
|
statements = append(statements, fmt.Sprintf(
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||||
|
domain, ip,
|
||||||
|
))
|
||||||
|
// Wildcard A record
|
||||||
|
statements = append(statements, fmt.Sprintf(
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('*.%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||||
|
domain, ip,
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute via RQLite HTTP API
|
// Execute via RQLite HTTP API
|
||||||
return ci.executeRQLiteStatements(rqliteDSN, statements)
|
return ci.executeRQLiteStatements(rqliteDSN, statements)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getExistingNameserverIPs queries RQLite for existing ns1, ns2, ns3 A record IPs
|
|
||||||
func (ci *CoreDNSInstaller) getExistingNameserverIPs(domain, rqliteDSN string) ([]string, error) {
|
|
||||||
// Build query - use url.QueryEscape to properly encode the SQL
|
|
||||||
query := fmt.Sprintf("SELECT fqdn, value FROM dns_records WHERE fqdn LIKE 'ns_.%s.' AND record_type = 'A' AND is_active = TRUE ORDER BY fqdn", domain)
|
|
||||||
queryURL := fmt.Sprintf("%s/db/query?q=%s", rqliteDSN, url.QueryEscape(query))
|
|
||||||
|
|
||||||
client := &http.Client{Timeout: 5 * time.Second}
|
|
||||||
resp, err := client.Get(queryURL)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
return nil, fmt.Errorf("query failed with status %d", resp.StatusCode)
|
|
||||||
}
|
|
||||||
|
|
||||||
var result struct {
|
|
||||||
Results []struct {
|
|
||||||
Values [][]interface{} `json:"values"`
|
|
||||||
} `json:"results"`
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(result.Results) == 0 || result.Results[0].Values == nil || len(result.Results[0].Values) < 3 {
|
|
||||||
return nil, fmt.Errorf("not enough nameserver records found")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract IPs for ns1, ns2, ns3 (ordered by fqdn)
|
|
||||||
ips := make([]string, 0, 3)
|
|
||||||
for _, row := range result.Results[0].Values {
|
|
||||||
if len(row) >= 2 {
|
|
||||||
if ip, ok := row[1].(string); ok {
|
|
||||||
ips = append(ips, ip)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(ips) != 3 {
|
|
||||||
return nil, fmt.Errorf("expected 3 nameserver IPs, got %d", len(ips))
|
|
||||||
}
|
|
||||||
|
|
||||||
return ips, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// rqliteResult represents the response from RQLite execute endpoint
|
// rqliteResult represents the response from RQLite execute endpoint
|
||||||
type rqliteResult struct {
|
type rqliteResult struct {
|
||||||
|
|||||||
@ -78,6 +78,12 @@ func (n *Node) startDNSHeartbeat(ctx context.Context) {
|
|||||||
if err := n.updateDNSHeartbeat(ctx); err != nil {
|
if err := n.updateDNSHeartbeat(ctx); err != nil {
|
||||||
n.logger.ComponentWarn(logging.ComponentNode, "Failed to update DNS heartbeat", zap.Error(err))
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to update DNS heartbeat", zap.Error(err))
|
||||||
}
|
}
|
||||||
|
// Self-healing: ensure this node's DNS records exist on every heartbeat
|
||||||
|
if err := n.ensureBaseDNSRecords(ctx); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS records on heartbeat", zap.Error(err))
|
||||||
|
}
|
||||||
|
// Remove DNS records for nodes that stopped heartbeating
|
||||||
|
n.cleanupStaleNodeRecords(ctx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@ -106,6 +112,191 @@ func (n *Node) updateDNSHeartbeat(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ensureBaseDNSRecords ensures this node's IP is present in the base DNS records.
|
||||||
|
// This provides self-healing: if records are missing (fresh install, DB reset),
|
||||||
|
// the node recreates them on startup. Each node only manages its own IP entries.
|
||||||
|
func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
|
||||||
|
domain := n.config.Node.Domain
|
||||||
|
if domain == "" {
|
||||||
|
domain = n.config.HTTPGateway.BaseDomain
|
||||||
|
}
|
||||||
|
if domain == "" {
|
||||||
|
return nil // No domain configured, skip
|
||||||
|
}
|
||||||
|
|
||||||
|
ipAddress, err := n.getNodeIPAddress()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to determine node IP: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure trailing dot for FQDN format (as CoreDNS expects)
|
||||||
|
fqdn := domain + "."
|
||||||
|
wildcardFQDN := "*." + domain + "."
|
||||||
|
|
||||||
|
db := n.rqliteAdapter.GetSQLDB()
|
||||||
|
|
||||||
|
// Insert root A record and wildcard A record for this node's IP
|
||||||
|
// ON CONFLICT DO NOTHING avoids duplicates (UNIQUE on fqdn, record_type, value)
|
||||||
|
records := []struct {
|
||||||
|
fqdn string
|
||||||
|
value string
|
||||||
|
}{
|
||||||
|
{fqdn, ipAddress},
|
||||||
|
{wildcardFQDN, ipAddress},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, r := range records {
|
||||||
|
query := `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||||
|
VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||||
|
ON CONFLICT(fqdn, record_type, value) DO NOTHING`
|
||||||
|
if _, err := db.ExecContext(ctx, query, r.fqdn, r.value); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS record",
|
||||||
|
zap.String("fqdn", r.fqdn), zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Claim an NS slot if available (ns1, ns2, or ns3)
|
||||||
|
n.claimNameserverSlot(ctx, domain, ipAddress)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// claimNameserverSlot attempts to claim an available NS hostname (ns1/ns2/ns3) for this node.
|
||||||
|
// If the node already has a slot, it updates the IP. If no slot is available, it does nothing.
|
||||||
|
func (n *Node) claimNameserverSlot(ctx context.Context, domain, ipAddress string) {
|
||||||
|
nodeID := n.GetPeerID()
|
||||||
|
db := n.rqliteAdapter.GetSQLDB()
|
||||||
|
|
||||||
|
// Check if this node already has a slot
|
||||||
|
var existingHostname string
|
||||||
|
err := db.QueryRowContext(ctx,
|
||||||
|
`SELECT hostname FROM dns_nameservers WHERE node_id = ? AND domain = ?`,
|
||||||
|
nodeID, domain,
|
||||||
|
).Scan(&existingHostname)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
// Already claimed — update IP if changed
|
||||||
|
if _, err := db.ExecContext(ctx,
|
||||||
|
`UPDATE dns_nameservers SET ip_address = ?, updated_at = datetime('now') WHERE hostname = ? AND domain = ?`,
|
||||||
|
ipAddress, existingHostname, domain,
|
||||||
|
); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to update NS slot IP", zap.Error(err))
|
||||||
|
}
|
||||||
|
// Ensure the glue A record matches
|
||||||
|
nsFQDN := existingHostname + "." + domain + "."
|
||||||
|
if _, err := db.ExecContext(ctx,
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||||
|
VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||||
|
ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||||
|
nsFQDN, ipAddress,
|
||||||
|
); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure NS glue record", zap.Error(err))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to claim an available slot
|
||||||
|
for _, hostname := range []string{"ns1", "ns2", "ns3"} {
|
||||||
|
result, err := db.ExecContext(ctx,
|
||||||
|
`INSERT INTO dns_nameservers (hostname, node_id, ip_address, domain) VALUES (?, ?, ?, ?)
|
||||||
|
ON CONFLICT(hostname) DO NOTHING`,
|
||||||
|
hostname, nodeID, ipAddress, domain,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rows, _ := result.RowsAffected()
|
||||||
|
if rows > 0 {
|
||||||
|
// Successfully claimed this slot — create glue record
|
||||||
|
nsFQDN := hostname + "." + domain + "."
|
||||||
|
if _, err := db.ExecContext(ctx,
|
||||||
|
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||||
|
VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||||
|
ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||||
|
nsFQDN, ipAddress,
|
||||||
|
); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to create NS glue record", zap.Error(err))
|
||||||
|
}
|
||||||
|
n.logger.ComponentInfo(logging.ComponentNode, "Claimed NS slot",
|
||||||
|
zap.String("hostname", hostname),
|
||||||
|
zap.String("ip", ipAddress),
|
||||||
|
)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanupStaleNodeRecords removes A records for nodes that have stopped heartbeating.
|
||||||
|
// This ensures DNS only returns IPs for healthy, active nodes.
|
||||||
|
func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
|
||||||
|
if n.rqliteAdapter == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
domain := n.config.Node.Domain
|
||||||
|
if domain == "" {
|
||||||
|
domain = n.config.HTTPGateway.BaseDomain
|
||||||
|
}
|
||||||
|
if domain == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
db := n.rqliteAdapter.GetSQLDB()
|
||||||
|
|
||||||
|
// Find nodes that haven't sent a heartbeat in over 2 minutes
|
||||||
|
staleQuery := `SELECT id, ip_address FROM dns_nodes WHERE status = 'active' AND last_seen < datetime('now', '-120 seconds')`
|
||||||
|
rows, err := db.QueryContext(ctx, staleQuery)
|
||||||
|
if err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to query stale nodes", zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
fqdn := domain + "."
|
||||||
|
wildcardFQDN := "*." + domain + "."
|
||||||
|
|
||||||
|
for rows.Next() {
|
||||||
|
var nodeID, ip string
|
||||||
|
if err := rows.Scan(&nodeID, &ip); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mark node as inactive
|
||||||
|
if _, err := db.ExecContext(ctx, `UPDATE dns_nodes SET status = 'inactive', updated_at = datetime('now') WHERE id = ?`, nodeID); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to mark node inactive", zap.String("node_id", nodeID), zap.Error(err))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove the dead node's A records from round-robin
|
||||||
|
for _, f := range []string{fqdn, wildcardFQDN} {
|
||||||
|
if _, err := db.ExecContext(ctx, `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`, f, ip); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove stale DNS record",
|
||||||
|
zap.String("fqdn", f), zap.String("ip", ip), zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release any NS slot held by this dead node
|
||||||
|
if _, err := db.ExecContext(ctx, `DELETE FROM dns_nameservers WHERE node_id = ?`, nodeID); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to release NS slot", zap.String("node_id", nodeID), zap.Error(err))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove glue records for this node's IP (ns1.domain., ns2.domain., ns3.domain.)
|
||||||
|
for _, ns := range []string{"ns1", "ns2", "ns3"} {
|
||||||
|
nsFQDN := ns + "." + domain + "."
|
||||||
|
if _, err := db.ExecContext(ctx,
|
||||||
|
`DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`,
|
||||||
|
nsFQDN, ip,
|
||||||
|
); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove NS glue record", zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
n.logger.ComponentInfo(logging.ComponentNode, "Removed stale node from DNS",
|
||||||
|
zap.String("node_id", nodeID),
|
||||||
|
zap.String("ip", ip),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// getNodeIPAddress attempts to determine the node's external IP address
|
// getNodeIPAddress attempts to determine the node's external IP address
|
||||||
func (n *Node) getNodeIPAddress() (string, error) {
|
func (n *Node) getNodeIPAddress() (string, error) {
|
||||||
// Try to detect external IP by connecting to a public server
|
// Try to detect external IP by connecting to a public server
|
||||||
|
|||||||
@ -110,6 +110,11 @@ func (n *Node) Start(ctx context.Context) error {
|
|||||||
} else {
|
} else {
|
||||||
// Start DNS heartbeat to keep node status fresh
|
// Start DNS heartbeat to keep node status fresh
|
||||||
n.startDNSHeartbeat(ctx)
|
n.startDNSHeartbeat(ctx)
|
||||||
|
|
||||||
|
// Ensure base DNS records exist for this node (self-healing)
|
||||||
|
if err := n.ensureBaseDNSRecords(ctx); err != nil {
|
||||||
|
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure base DNS records", zap.Error(err))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get listen addresses for logging
|
// Get listen addresses for logging
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user