Fixed DNS bugs affecting deployment

This commit is contained in:
anonpenguin23 2026-01-29 07:22:32 +02:00
parent d4f5f3b999
commit 5ec292a4f2
5 changed files with 269 additions and 118 deletions

View File

@ -0,0 +1,19 @@
-- Migration 011: DNS Nameservers Table
-- Creates dns_nameservers, which maps each NS hostname (ns1, ns2, ns3) to the
-- node ID and IP address currently assigned to it.
-- Provides stable NS assignment that survives restarts and re-seeding.
-- The whole migration runs in one transaction; CREATE TABLE IF NOT EXISTS and
-- INSERT OR IGNORE are used so that running it again does not fail.
BEGIN;
CREATE TABLE IF NOT EXISTS dns_nameservers (
-- NOTE(review): hostname alone is the primary key, so a given hostname such as
-- "ns1" can exist in only one row even though domain is stored per row —
-- confirm that a single base domain per cluster is intended.
hostname TEXT PRIMARY KEY, -- e.g., "ns1", "ns2", "ns3"
node_id TEXT NOT NULL, -- Peer ID of the assigned node
ip_address TEXT NOT NULL, -- IP address of the assigned node
domain TEXT NOT NULL, -- Base domain (e.g., "dbrs.space")
-- Both timestamps default to the insert time. Nothing in this migration
-- refreshes updated_at automatically; writers must set it themselves.
assigned_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(node_id, domain) -- A node can only hold one NS slot per domain
);
-- Record that migration 11 has been applied.
-- NOTE(review): OR IGNORE only skips the insert if schema_migrations enforces
-- uniqueness on version — TODO confirm against that table's definition.
INSERT OR IGNORE INTO schema_migrations(version) VALUES (11);
COMMIT;

View File

@ -6,6 +6,7 @@ import (
"os"
"path/filepath"
"strings"
"time"
"github.com/DeBrosOfficial/network/pkg/cli/utils"
"github.com/DeBrosOfficial/network/pkg/environments/production"
@ -167,6 +168,18 @@ func (o *Orchestrator) Execute() error {
return fmt.Errorf("service creation failed: %w", err)
}
// Seed DNS records after services are running (RQLite must be up)
if o.flags.Nameserver && o.flags.BaseDomain != "" {
fmt.Printf("\n🌐 Phase 6: Seeding DNS records...\n")
fmt.Printf(" Waiting for RQLite to start (10s)...\n")
time.Sleep(10 * time.Second)
if err := o.setup.SeedDNSRecords(o.flags.BaseDomain, o.flags.VpsIP, o.peers); err != nil {
fmt.Fprintf(os.Stderr, " ⚠️ Warning: Failed to seed DNS records: %v\n", err)
} else {
fmt.Printf(" ✓ DNS records seeded\n")
}
}
// Log completion with actual peer ID
o.setup.LogSetupComplete(o.setup.NodePeerID)
fmt.Printf("✅ Production installation complete!\n\n")

View File

@ -6,7 +6,6 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"os"
"os/exec"
"path/filepath"
@ -313,137 +312,61 @@ func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string {
`, domain, domain, rqliteDSN)
}
// seedStaticRecords builds SQL statements for the static zone records of domain
// and sends them to RQLite at rqliteDSN through executeRQLiteStatements.
// The visible statements cover the SOA record, the ns1/ns2/ns3 NS records, and
// root ("<domain>.") and wildcard ("*.<domain>.") A records for ns1IP, ns2IP
// and ns3IP. The inserts that carry ON CONFLICT(fqdn, record_type, value)
// DO NOTHING are intended to be repeatable without creating duplicates.
// It returns the error from executeRQLiteStatements; a failure of the DELETE
// batch is wrapped as "failed to clean up old records".
//
// NOTE(review): this span comes from a diff view and appears to interleave the
// removed implementation with its replacement (two variants of the header
// comment were present, and the braces inside the function do not balance).
// Confirm the real statement order against the repository before relying on it.
// NOTE(review): domain and the IP arguments are interpolated directly into the
// SQL text; this assumes they never contain a single quote — TODO confirm that
// callers validate them.
func (ci *CoreDNSInstaller) seedStaticRecords(domain, rqliteDSN, ns1IP, ns2IP, ns3IP string) error {
// NOTE(review): from here down to the `range records` loop looks like the
// pre-change body (reuse of stored NS IPs, destructive DELETEs, full
// re-insert) — TODO confirm.
// First, check if nameserver A records already exist with different IPs
// If so, we should preserve them instead of overwriting with potentially wrong IPs
existingNSIPs, err := ci.getExistingNameserverIPs(domain, rqliteDSN)
if err == nil && len(existingNSIPs) == 3 {
// Check if they have at least 2 different IPs (properly configured cluster)
uniqueIPs := make(map[string]bool)
for _, ip := range existingNSIPs {
uniqueIPs[ip] = true
}
if len(uniqueIPs) >= 2 {
// Nameserver records are already properly configured, use existing IPs
fmt.Fprintf(ci.logWriter, " Using existing nameserver IPs from database\n")
ns1IP = existingNSIPs[0]
ns2IP = existingNSIPs[1]
ns3IP = existingNSIPs[2]
}
}
// Generate the SOA serial from the current Unix timestamp (seconds), so it
// differs on every run
serial := fmt.Sprintf("%d", time.Now().Unix())
// SOA record format: "mname rname serial refresh retry expire minimum"
soaValue := fmt.Sprintf("ns1.%s. admin.%s. %s 3600 1800 604800 300", domain, domain, serial)
// First, delete existing system records to avoid duplicates
// We only delete system records, not deployment-created records
deleteStatements := []string{
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '%s.' AND record_type IN ('SOA', 'NS', 'A')`, domain),
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn = '*.%s.' AND record_type = 'A'`, domain),
fmt.Sprintf(`DELETE FROM dns_records WHERE namespace = 'system' AND fqdn LIKE 'ns%%.%s.' AND record_type = 'A'`, domain),
}
if err := ci.executeRQLiteStatements(rqliteDSN, deleteStatements); err != nil {
return fmt.Errorf("failed to clean up old records: %w", err)
}
// Define all static records
records := []struct {
fqdn string
recordType string
value string
ttl int
}{
// SOA record
{domain + ".", "SOA", soaValue, 300},
// NS records
{domain + ".", "NS", "ns1." + domain + ".", 300},
{domain + ".", "NS", "ns2." + domain + ".", 300},
{domain + ".", "NS", "ns3." + domain + ".", 300},
// Nameserver A records (glue)
{"ns1." + domain + ".", "A", ns1IP, 300},
{"ns2." + domain + ".", "A", ns2IP, 300},
{"ns3." + domain + ".", "A", ns3IP, 300},
// Root domain A records (round-robin)
{domain + ".", "A", ns1IP, 300},
{domain + ".", "A", ns2IP, 300},
{domain + ".", "A", ns3IP, 300},
// Wildcard A records (round-robin)
{"*." + domain + ".", "A", ns1IP, 300},
{"*." + domain + ".", "A", ns2IP, 300},
{"*." + domain + ".", "A", ns3IP, 300},
}
// Build SQL statements
var statements []string
for _, r := range records {
// IMPORTANT: Must set is_active = TRUE for CoreDNS to find the records
stmt := fmt.Sprintf(
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s', '%s', '%s', %d, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
r.fqdn, r.recordType, r.value, r.ttl,
)
statements = append(statements, stmt)
// NOTE(review): the statements from here on look like the post-change
// (non-destructive) body — TODO confirm.
// SOA record — delete old and insert new (serial changes each time, so value differs)
statements = append(statements, fmt.Sprintf(
`DELETE FROM dns_records WHERE fqdn = '%s.' AND record_type = 'SOA' AND namespace = 'system'`,
domain,
))
statements = append(statements, fmt.Sprintf(
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'SOA', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))`,
domain, soaValue,
))
// NS records — idempotent insert
for i := 1; i <= 3; i++ {
statements = append(statements, fmt.Sprintf(
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'NS', 'ns%d.%s.', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
domain, i, domain,
))
}
// NOTE: Nameserver glue A records (ns1/ns2/ns3) are NOT seeded here.
// They are managed by each node's claimNameserverSlot() on the heartbeat loop,
// which correctly maps each NS hostname to exactly one node's IP.
// Round-robin A records — each unique IP is added once (no duplicates due to UNIQUE constraint)
// NOTE(review): an empty IP argument is not filtered out and would be inserted
// as an A record with an empty value — verify callers always pass real IPs.
uniqueIPs := make(map[string]bool)
for _, ip := range []string{ns1IP, ns2IP, ns3IP} {
if !uniqueIPs[ip] {
uniqueIPs[ip] = true
// Root domain A record
statements = append(statements, fmt.Sprintf(
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
domain, ip,
))
// Wildcard A record
statements = append(statements, fmt.Sprintf(
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at) VALUES ('*.%s.', 'A', '%s', 300, 'system', 'system', TRUE, datetime('now'), datetime('now')) ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
domain, ip,
))
}
}
// Execute via RQLite HTTP API
return ci.executeRQLiteStatements(rqliteDSN, statements)
}
// getExistingNameserverIPs asks the RQLite HTTP query endpoint at rqliteDSN for
// the active A records whose name matches "ns?.<domain>." and returns their
// values in fqdn order. It returns an error when the request fails, the
// response status is not 200, the body cannot be decoded, or anything other
// than exactly three string IPs is found.
func (ci *CoreDNSInstaller) getExistingNameserverIPs(domain, rqliteDSN string) ([]string, error) {
	// The SQL text travels in the "q" query parameter, so it must be URL-escaped.
	sqlText := fmt.Sprintf("SELECT fqdn, value FROM dns_records WHERE fqdn LIKE 'ns_.%s.' AND record_type = 'A' AND is_active = TRUE ORDER BY fqdn", domain)
	endpoint := fmt.Sprintf("%s/db/query?q=%s", rqliteDSN, url.QueryEscape(sqlText))

	httpClient := &http.Client{Timeout: 5 * time.Second}
	resp, err := httpClient.Get(endpoint)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("query failed with status %d", resp.StatusCode)
	}

	var payload struct {
		Results []struct {
			Values [][]interface{} `json:"values"`
		} `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, err
	}

	// A nil Values slice has length zero, so the length test also covers "no rows".
	if len(payload.Results) == 0 || len(payload.Results[0].Values) < 3 {
		return nil, fmt.Errorf("not enough nameserver records found")
	}

	// Column 1 of every row is the record value; rows that are too short or
	// carry a non-string value are skipped and caught by the count check below.
	ips := make([]string, 0, 3)
	for _, row := range payload.Results[0].Values {
		if len(row) < 2 {
			continue
		}
		ip, isString := row[1].(string)
		if !isString {
			continue
		}
		ips = append(ips, ip)
	}

	if len(ips) != 3 {
		return nil, fmt.Errorf("expected 3 nameserver IPs, got %d", len(ips))
	}
	return ips, nil
}
// rqliteResult represents the response from RQLite execute endpoint
type rqliteResult struct {

View File

@ -78,6 +78,12 @@ func (n *Node) startDNSHeartbeat(ctx context.Context) {
if err := n.updateDNSHeartbeat(ctx); err != nil {
n.logger.ComponentWarn(logging.ComponentNode, "Failed to update DNS heartbeat", zap.Error(err))
}
// Self-healing: ensure this node's DNS records exist on every heartbeat
if err := n.ensureBaseDNSRecords(ctx); err != nil {
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS records on heartbeat", zap.Error(err))
}
// Remove DNS records for nodes that stopped heartbeating
n.cleanupStaleNodeRecords(ctx)
}
}
}()
@ -106,6 +112,191 @@ func (n *Node) updateDNSHeartbeat(ctx context.Context) error {
return nil
}
// ensureBaseDNSRecords makes sure this node's IP is present in the base DNS
// records, so the zone heals itself when records are missing (fresh install,
// database reset). Each node only manages entries for its own IP.
//
// The domain is taken from config.Node.Domain, falling back to
// config.HTTPGateway.BaseDomain. When no RQLite adapter or no domain is
// available the call is a no-op. Otherwise it inserts a root A record
// ("<domain>.") and a wildcard A record ("*.<domain>.") for the node's IP in
// the system namespace and then calls claimNameserverSlot.
//
// Only a failure to determine the node IP is returned. Insert failures are
// logged and skipped so one failed write does not block the remaining work.
func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
	// Same guard as cleanupStaleNodeRecords: without an adapter there is no
	// database handle to write to.
	if n.rqliteAdapter == nil {
		return nil
	}

	domain := n.config.Node.Domain
	if domain == "" {
		domain = n.config.HTTPGateway.BaseDomain
	}
	if domain == "" {
		return nil // No domain configured, skip
	}

	ipAddress, err := n.getNodeIPAddress()
	if err != nil {
		return fmt.Errorf("failed to determine node IP: %w", err)
	}

	db := n.rqliteAdapter.GetSQLDB()

	// ON CONFLICT DO NOTHING avoids duplicates (UNIQUE on fqdn, record_type, value),
	// which makes this safe to repeat on every heartbeat.
	const insertARecord = `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
		VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
		ON CONFLICT(fqdn, record_type, value) DO NOTHING`

	// Root and wildcard names, with the trailing dot CoreDNS expects for FQDNs.
	for _, fqdn := range []string{domain + ".", "*." + domain + "."} {
		if _, err := db.ExecContext(ctx, insertARecord, fqdn, ipAddress); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure DNS record",
				zap.String("fqdn", fqdn), zap.Error(err))
		}
	}

	// Claim an NS slot if available (ns1, ns2, or ns3)
	n.claimNameserverSlot(ctx, domain, ipAddress)
	return nil
}
// claimNameserverSlot makes sure this node holds at most one NS hostname
// (ns1/ns2/ns3) for domain and that the hostname's glue A record points at
// ipAddress and nothing else.
//
// If dns_nameservers already has a row for this node and domain, the stored IP
// is refreshed. Otherwise the hostnames are tried in order and the first insert
// that actually adds a row wins; if every slot is taken nothing happens.
// Every failure is logged and otherwise ignored, so the caller never sees an
// error. The call is a no-op when no RQLite adapter is available.
func (n *Node) claimNameserverSlot(ctx context.Context, domain, ipAddress string) {
	if n.rqliteAdapter == nil {
		return
	}
	nodeID := n.GetPeerID()
	db := n.rqliteAdapter.GetSQLDB()

	// syncGlue makes the system glue A record for hostname resolve only to
	// ipAddress. failMsg is the log message used when a statement fails.
	syncGlue := func(hostname, failMsg string) {
		nsFQDN := hostname + "." + domain + "."
		// Remove glue values left behind by an earlier IP or a previous slot
		// owner; without this the hostname would keep resolving to the old
		// address as well as the new one.
		if _, err := db.ExecContext(ctx,
			`DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value <> ? AND namespace = 'system'`,
			nsFQDN, ipAddress,
		); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, failMsg, zap.Error(err))
		}
		if _, err := db.ExecContext(ctx,
			`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
			VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
			ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
			nsFQDN, ipAddress,
		); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, failMsg, zap.Error(err))
		}
	}

	// Check if this node already has a slot
	var existingHostname string
	err := db.QueryRowContext(ctx,
		`SELECT hostname FROM dns_nameservers WHERE node_id = ? AND domain = ?`,
		nodeID, domain,
	).Scan(&existingHostname)
	if err == nil {
		// Already claimed — keep the stored IP and the glue record current
		if _, err := db.ExecContext(ctx,
			`UPDATE dns_nameservers SET ip_address = ?, updated_at = datetime('now') WHERE hostname = ? AND domain = ?`,
			ipAddress, existingHostname, domain,
		); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to update NS slot IP", zap.Error(err))
		}
		syncGlue(existingHostname, "Failed to ensure NS glue record")
		return
	}

	// Any lookup error (normally "no rows") is treated as "no slot held yet".
	// ON CONFLICT(hostname) DO NOTHING turns an already-taken hostname into a
	// zero-row insert, so RowsAffected tells us whether we won the slot.
	for _, hostname := range []string{"ns1", "ns2", "ns3"} {
		result, err := db.ExecContext(ctx,
			`INSERT INTO dns_nameservers (hostname, node_id, ip_address, domain) VALUES (?, ?, ?, ?)
			ON CONFLICT(hostname) DO NOTHING`,
			hostname, nodeID, ipAddress, domain,
		)
		if err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to claim NS slot",
				zap.String("hostname", hostname), zap.Error(err))
			continue
		}
		rows, err := result.RowsAffected()
		if err != nil || rows == 0 {
			// Slot is taken, or the driver could not report the outcome; if the
			// row was in fact written, the next call repairs the glue record
			// through the "already claimed" path.
			continue
		}

		// Successfully claimed this slot — create glue record
		syncGlue(hostname, "Failed to create NS glue record")
		n.logger.ComponentInfo(logging.ComponentNode, "Claimed NS slot",
			zap.String("hostname", hostname),
			zap.String("ip", ipAddress),
		)
		return
	}
}
// cleanupStaleNodeRecords retires nodes that have stopped heartbeating so DNS
// only returns IPs of nodes that are still reporting in.
//
// A node counts as stale when its dns_nodes row is still 'active' but last_seen
// is more than 120 seconds old. For each such node this marks the row inactive,
// deletes the system A records for the root and wildcard names that carry the
// node's IP, releases any dns_nameservers slot the node holds, and deletes
// ns1/ns2/ns3 glue A records that carry its IP.
//
// The work is best-effort: every failure is logged and the remaining steps
// still run. It is a no-op without an RQLite adapter or a configured domain.
func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
	if n.rqliteAdapter == nil {
		return
	}

	domain := n.config.Node.Domain
	if domain == "" {
		domain = n.config.HTTPGateway.BaseDomain
	}
	if domain == "" {
		return
	}

	db := n.rqliteAdapter.GetSQLDB()

	// Find nodes that haven't sent a heartbeat in over 2 minutes
	staleQuery := `SELECT id, ip_address FROM dns_nodes WHERE status = 'active' AND last_seen < datetime('now', '-120 seconds')`
	rows, err := db.QueryContext(ctx, staleQuery)
	if err != nil {
		n.logger.ComponentWarn(logging.ComponentNode, "Failed to query stale nodes", zap.Error(err))
		return
	}

	// Read the whole result set first so no query cursor stays open while the
	// cleanup statements below run.
	type staleNode struct{ id, ip string }
	var stale []staleNode
	for rows.Next() {
		var s staleNode
		if err := rows.Scan(&s.id, &s.ip); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to scan stale node row", zap.Error(err))
			continue
		}
		stale = append(stale, s)
	}
	// Next returning false can mean an iteration error rather than the end of
	// the data; rows already read are still valid and are processed below.
	if err := rows.Err(); err != nil {
		n.logger.ComponentWarn(logging.ComponentNode, "Failed to iterate stale nodes", zap.Error(err))
	}
	if err := rows.Close(); err != nil {
		n.logger.ComponentWarn(logging.ComponentNode, "Failed to close stale node query", zap.Error(err))
	}

	const deleteARecord = `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`
	roundRobinFQDNs := []string{domain + ".", "*." + domain + "."}

	for _, node := range stale {
		// Mark node as inactive
		if _, err := db.ExecContext(ctx, `UPDATE dns_nodes SET status = 'inactive', updated_at = datetime('now') WHERE id = ?`, node.id); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to mark node inactive", zap.String("node_id", node.id), zap.Error(err))
		}

		// Remove the dead node's A records from round-robin
		for _, f := range roundRobinFQDNs {
			if _, err := db.ExecContext(ctx, deleteARecord, f, node.ip); err != nil {
				n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove stale DNS record",
					zap.String("fqdn", f), zap.String("ip", node.ip), zap.Error(err))
			}
		}

		// Release any NS slot held by this dead node
		if _, err := db.ExecContext(ctx, `DELETE FROM dns_nameservers WHERE node_id = ?`, node.id); err != nil {
			n.logger.ComponentWarn(logging.ComponentNode, "Failed to release NS slot", zap.String("node_id", node.id), zap.Error(err))
		}

		// Remove glue records for this node's IP (ns1.domain., ns2.domain., ns3.domain.)
		for _, ns := range []string{"ns1", "ns2", "ns3"} {
			nsFQDN := ns + "." + domain + "."
			if _, err := db.ExecContext(ctx, deleteARecord, nsFQDN, node.ip); err != nil {
				n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove NS glue record", zap.Error(err))
			}
		}

		n.logger.ComponentInfo(logging.ComponentNode, "Removed stale node from DNS",
			zap.String("node_id", node.id),
			zap.String("ip", node.ip),
		)
	}
}
// getNodeIPAddress attempts to determine the node's external IP address
func (n *Node) getNodeIPAddress() (string, error) {
// Try to detect external IP by connecting to a public server

View File

@ -110,6 +110,11 @@ func (n *Node) Start(ctx context.Context) error {
} else {
// Start DNS heartbeat to keep node status fresh
n.startDNSHeartbeat(ctx)
// Ensure base DNS records exist for this node (self-healing)
if err := n.ensureBaseDNSRecords(ctx); err != nil {
n.logger.ComponentWarn(logging.ComponentNode, "Failed to ensure base DNS records", zap.Error(err))
}
}
// Get listen addresses for logging