mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 04:53:00 +00:00
Updated docs and fixed WG and IP bugs
This commit is contained in:
parent
4acea72467
commit
810094771d
141
docs/CLEAN_NODE.md
Normal file
141
docs/CLEAN_NODE.md
Normal file
@ -0,0 +1,141 @@
|
||||
# Clean Node — Full Reset Guide
|
||||
|
||||
How to completely remove all Orama Network state from a VPS so it can be reinstalled fresh.
|
||||
|
||||
## Quick Clean (Copy-Paste)
|
||||
|
||||
Run this as root or with sudo on the target VPS:
|
||||
|
||||
```bash
|
||||
# 1. Stop and disable all services
|
||||
sudo systemctl stop debros-node debros-ipfs debros-ipfs-cluster debros-olric coredns caddy 2>/dev/null
|
||||
sudo systemctl disable debros-node debros-ipfs debros-ipfs-cluster debros-olric coredns caddy 2>/dev/null
|
||||
|
||||
# 2. Remove systemd service files
|
||||
sudo rm -f /etc/systemd/system/debros-*.service
|
||||
sudo rm -f /etc/systemd/system/coredns.service
|
||||
sudo rm -f /etc/systemd/system/caddy.service
|
||||
sudo systemctl daemon-reload
|
||||
|
||||
# 3. Tear down WireGuard
|
||||
# Must stop the systemd unit first — wg-quick@wg0 is a oneshot with
|
||||
# RemainAfterExit=yes, so it stays "active (exited)" even after the
|
||||
# interface is removed. Without "stop", a future "systemctl start" is a no-op.
|
||||
sudo systemctl stop wg-quick@wg0 2>/dev/null
|
||||
sudo wg-quick down wg0 2>/dev/null
|
||||
sudo systemctl disable wg-quick@wg0 2>/dev/null
|
||||
sudo rm -f /etc/wireguard/wg0.conf
|
||||
|
||||
# 4. Reset UFW firewall
|
||||
sudo ufw --force reset
|
||||
sudo ufw allow 22/tcp
|
||||
sudo ufw --force enable
|
||||
|
||||
# 5. Remove debros user and home directory
|
||||
sudo userdel -r debros 2>/dev/null
|
||||
sudo rm -rf /home/debros
|
||||
|
||||
# 6. Remove sudoers files
|
||||
sudo rm -f /etc/sudoers.d/debros-access
|
||||
sudo rm -f /etc/sudoers.d/debros-deployments
|
||||
sudo rm -f /etc/sudoers.d/debros-wireguard
|
||||
|
||||
# 7. Remove CoreDNS config
|
||||
sudo rm -rf /etc/coredns
|
||||
|
||||
# 8. Remove Caddy config and data
|
||||
sudo rm -rf /etc/caddy
|
||||
sudo rm -rf /var/lib/caddy
|
||||
|
||||
# 9. Remove deployment systemd services (dynamic)
|
||||
sudo rm -f /etc/systemd/system/orama-deploy-*.service
|
||||
sudo systemctl daemon-reload
|
||||
|
||||
# 10. Clean temp files
|
||||
sudo rm -f /tmp/orama /tmp/network-source.tar.gz /tmp/network-source.zip
|
||||
sudo rm -rf /tmp/network-extract /tmp/coredns-build /tmp/caddy-build
|
||||
|
||||
echo "Node cleaned. Ready for fresh install."
|
||||
```
|
||||
|
||||
## What This Removes
|
||||
|
||||
| Category | Paths |
|
||||
|----------|-------|
|
||||
| **User** | `debros` system user and `/home/debros/` |
|
||||
| **App data** | `/home/debros/.orama/` (configs, secrets, logs, IPFS, RQLite, Olric) |
|
||||
| **Source code** | `/home/debros/src/` |
|
||||
| **Binaries** | `/home/debros/bin/orama-node`, `/home/debros/bin/gateway` |
|
||||
| **Systemd** | `debros-*.service`, `coredns.service`, `caddy.service`, `orama-deploy-*.service` |
|
||||
| **WireGuard** | `/etc/wireguard/wg0.conf`; the `wg-quick@wg0` systemd unit is stopped and disabled (not deleted) |
|
||||
| **Firewall** | All UFW rules (reset to default + SSH only) |
|
||||
| **Sudoers** | `/etc/sudoers.d/debros-*` |
|
||||
| **CoreDNS** | `/etc/coredns/Corefile` |
|
||||
| **Caddy** | `/etc/caddy/Caddyfile`, `/var/lib/caddy/` (TLS certs) |
|
||||
| **Temp files** | `/tmp/orama`, `/tmp/network-source.*`, build dirs |
|
||||
|
||||
## What This Does NOT Remove
|
||||
|
||||
These are shared system tools that may be used by other software. Remove manually if desired:
|
||||
|
||||
| Binary | Path | Remove Command |
|
||||
|--------|------|----------------|
|
||||
| RQLite | `/usr/local/bin/rqlited` | `sudo rm /usr/local/bin/rqlited` |
|
||||
| IPFS | `/usr/local/bin/ipfs` | `sudo rm /usr/local/bin/ipfs` |
|
||||
| IPFS Cluster | `/usr/local/bin/ipfs-cluster-service` | `sudo rm /usr/local/bin/ipfs-cluster-service` |
|
||||
| Olric | `/usr/local/bin/olric-server` | `sudo rm /usr/local/bin/olric-server` |
|
||||
| CoreDNS | `/usr/local/bin/coredns` | `sudo rm /usr/local/bin/coredns` |
|
||||
| Caddy | `/usr/bin/caddy` | `sudo rm /usr/bin/caddy` |
|
||||
| xcaddy | `/usr/local/bin/xcaddy` | `sudo rm /usr/local/bin/xcaddy` |
|
||||
| Go | `/usr/local/go/` | `sudo rm -rf /usr/local/go` |
|
||||
| Orama CLI | `/usr/local/bin/orama` | `sudo rm /usr/local/bin/orama` |
|
||||
|
||||
## Nuclear Clean (Remove Everything Including Binaries)
|
||||
|
||||
```bash
|
||||
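# Run quick clean above first, then remove the installed binaries.
# Note: the Go toolchain (/usr/local/go) is NOT removed here; delete it manually if desired.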
|
||||
sudo rm -f /usr/local/bin/rqlited
|
||||
sudo rm -f /usr/local/bin/ipfs
|
||||
sudo rm -f /usr/local/bin/ipfs-cluster-service
|
||||
sudo rm -f /usr/local/bin/olric-server
|
||||
sudo rm -f /usr/local/bin/coredns
|
||||
sudo rm -f /usr/local/bin/xcaddy
|
||||
sudo rm -f /usr/bin/caddy
|
||||
sudo rm -f /usr/local/bin/orama
|
||||
```
|
||||
|
||||
## Multi-Node Clean
|
||||
|
||||
To clean all nodes in one pass from your local machine (requires `sshpass`; nodes are cleaned one after another):
|
||||
|
||||
```bash
|
||||
# Define your nodes
|
||||
NODES=(
|
||||
"ubuntu@141.227.165.168:password1"
|
||||
"ubuntu@141.227.165.154:password2"
|
||||
"ubuntu@141.227.156.51:password3"
|
||||
)
|
||||
|
||||
for entry in "${NODES[@]}"; do
|
||||
IFS=: read -r userhost pass <<< "$entry"
|
||||
echo "Cleaning $userhost..."
|
||||
sshpass -p "$pass" ssh -o StrictHostKeyChecking=no "$userhost" 'bash -s' << 'CLEAN'
|
||||
sudo systemctl stop debros-node debros-ipfs debros-ipfs-cluster debros-olric coredns caddy 2>/dev/null
|
||||
sudo systemctl disable debros-node debros-ipfs debros-ipfs-cluster debros-olric coredns caddy 2>/dev/null
|
||||
sudo rm -f /etc/systemd/system/debros-*.service /etc/systemd/system/coredns.service /etc/systemd/system/caddy.service /etc/systemd/system/orama-deploy-*.service
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl stop wg-quick@wg0 2>/dev/null
|
||||
sudo wg-quick down wg0 2>/dev/null
|
||||
sudo systemctl disable wg-quick@wg0 2>/dev/null
|
||||
sudo rm -f /etc/wireguard/wg0.conf
|
||||
sudo ufw --force reset && sudo ufw allow 22/tcp && sudo ufw --force enable
|
||||
sudo userdel -r debros 2>/dev/null
|
||||
sudo rm -rf /home/debros
|
||||
sudo rm -f /etc/sudoers.d/debros-access /etc/sudoers.d/debros-deployments /etc/sudoers.d/debros-wireguard
|
||||
sudo rm -rf /etc/coredns /etc/caddy /var/lib/caddy
|
||||
sudo rm -f /tmp/orama /tmp/network-source.tar.gz /tmp/network-source.zip
|
||||
sudo rm -rf /tmp/network-extract /tmp/coredns-build /tmp/caddy-build
|
||||
echo "Done"
|
||||
CLEAN
|
||||
done
|
||||
```
|
||||
@ -199,15 +199,24 @@ func (o *Orchestrator) executeGenesisFlow() error {
|
||||
return fmt.Errorf("service creation failed: %w", err)
|
||||
}
|
||||
|
||||
// Phase 7: Seed DNS records
|
||||
// Phase 7: Seed DNS records (with retry — migrations may still be running)
|
||||
if o.flags.Nameserver && o.flags.BaseDomain != "" {
|
||||
fmt.Printf("\n🌐 Phase 7: Seeding DNS records...\n")
|
||||
fmt.Printf(" Waiting for RQLite to start (10s)...\n")
|
||||
time.Sleep(10 * time.Second)
|
||||
if err := o.setup.SeedDNSRecords(o.flags.BaseDomain, o.flags.VpsIP, o.peers); err != nil {
|
||||
fmt.Fprintf(os.Stderr, " ⚠️ Warning: Failed to seed DNS records: %v\n", err)
|
||||
} else {
|
||||
fmt.Printf(" ✓ DNS records seeded\n")
|
||||
var seedErr error
|
||||
for attempt := 1; attempt <= 6; attempt++ {
|
||||
waitSec := 5 * attempt
|
||||
fmt.Printf(" Waiting for RQLite + migrations (%ds, attempt %d/6)...\n", waitSec, attempt)
|
||||
time.Sleep(time.Duration(waitSec) * time.Second)
|
||||
seedErr = o.setup.SeedDNSRecords(o.flags.BaseDomain, o.flags.VpsIP, o.peers)
|
||||
if seedErr == nil {
|
||||
fmt.Printf(" ✓ DNS records seeded\n")
|
||||
break
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, " ⚠️ Attempt %d failed: %v\n", attempt, seedErr)
|
||||
}
|
||||
if seedErr != nil {
|
||||
fmt.Fprintf(os.Stderr, " ⚠️ Warning: DNS seeding failed after all attempts.\n")
|
||||
fmt.Fprintf(os.Stderr, " Records will self-heal via node heartbeat once running.\n")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -138,8 +138,8 @@ func (bi *BinaryInstaller) InstallCaddy() error {
|
||||
}
|
||||
|
||||
// ConfigureCaddy creates Caddy configuration files
|
||||
func (bi *BinaryInstaller) ConfigureCaddy(domain string, email string, acmeEndpoint string) error {
|
||||
return bi.caddy.Configure(domain, email, acmeEndpoint)
|
||||
func (bi *BinaryInstaller) ConfigureCaddy(domain string, email string, acmeEndpoint string, baseDomain string) error {
|
||||
return bi.caddy.Configure(domain, email, acmeEndpoint, baseDomain)
|
||||
}
|
||||
|
||||
// Mock system commands for testing (if needed)
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -158,15 +159,17 @@ func (ci *CaddyInstaller) Install() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Configure creates Caddy configuration files
|
||||
func (ci *CaddyInstaller) Configure(domain string, email string, acmeEndpoint string) error {
|
||||
// Configure creates Caddy configuration files.
|
||||
// baseDomain is optional — if provided (and different from domain), Caddy will also
|
||||
// serve traffic for the base domain and its wildcard (e.g., *.dbrs.space).
|
||||
func (ci *CaddyInstaller) Configure(domain string, email string, acmeEndpoint string, baseDomain string) error {
|
||||
configDir := "/etc/caddy"
|
||||
if err := os.MkdirAll(configDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create config directory: %w", err)
|
||||
}
|
||||
|
||||
// Create Caddyfile
|
||||
caddyfile := ci.generateCaddyfile(domain, email, acmeEndpoint)
|
||||
caddyfile := ci.generateCaddyfile(domain, email, acmeEndpoint, baseDomain)
|
||||
if err := os.WriteFile(filepath.Join(configDir, "Caddyfile"), []byte(caddyfile), 0644); err != nil {
|
||||
return fmt.Errorf("failed to write Caddyfile: %w", err)
|
||||
}
|
||||
@ -364,32 +367,31 @@ require (
|
||||
`
|
||||
}
|
||||
|
||||
// generateCaddyfile creates the Caddyfile configuration
|
||||
func (ci *CaddyInstaller) generateCaddyfile(domain, email, acmeEndpoint string) string {
|
||||
return fmt.Sprintf(`{
|
||||
email %s
|
||||
}
|
||||
|
||||
*.%s {
|
||||
tls {
|
||||
// generateCaddyfile creates the Caddyfile configuration.
|
||||
// If baseDomain is provided and different from domain, Caddy also serves
|
||||
// the base domain and its wildcard (e.g., *.dbrs.space alongside *.node1.dbrs.space).
|
||||
func (ci *CaddyInstaller) generateCaddyfile(domain, email, acmeEndpoint, baseDomain string) string {
|
||||
tlsBlock := fmt.Sprintf(` tls {
|
||||
dns orama {
|
||||
endpoint %s
|
||||
}
|
||||
}
|
||||
reverse_proxy localhost:6001
|
||||
}
|
||||
}`, acmeEndpoint)
|
||||
|
||||
%s {
|
||||
tls {
|
||||
dns orama {
|
||||
endpoint %s
|
||||
}
|
||||
}
|
||||
reverse_proxy localhost:6001
|
||||
}
|
||||
var sb strings.Builder
|
||||
sb.WriteString(fmt.Sprintf("{\n email %s\n}\n", email))
|
||||
|
||||
:80 {
|
||||
reverse_proxy localhost:6001
|
||||
}
|
||||
`, email, domain, acmeEndpoint, domain, acmeEndpoint)
|
||||
// Node domain blocks (e.g., node1.dbrs.space, *.node1.dbrs.space)
|
||||
sb.WriteString(fmt.Sprintf("\n*.%s {\n%s\n reverse_proxy localhost:6001\n}\n", domain, tlsBlock))
|
||||
sb.WriteString(fmt.Sprintf("\n%s {\n%s\n reverse_proxy localhost:6001\n}\n", domain, tlsBlock))
|
||||
|
||||
// Base domain blocks (e.g., dbrs.space, *.dbrs.space) — for app routing
|
||||
if baseDomain != "" && baseDomain != domain {
|
||||
sb.WriteString(fmt.Sprintf("\n*.%s {\n%s\n reverse_proxy localhost:6001\n}\n", baseDomain, tlsBlock))
|
||||
sb.WriteString(fmt.Sprintf("\n%s {\n%s\n reverse_proxy localhost:6001\n}\n", baseDomain, tlsBlock))
|
||||
}
|
||||
|
||||
// HTTP fallback (handles plain HTTP and ACME challenges)
|
||||
sb.WriteString("\n:80 {\n reverse_proxy localhost:6001\n}\n")
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
@ -557,7 +557,7 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s
|
||||
}
|
||||
email := "admin@" + caddyDomain
|
||||
acmeEndpoint := "http://localhost:6001/v1/internal/acme"
|
||||
if err := ps.binaryInstaller.ConfigureCaddy(caddyDomain, email, acmeEndpoint); err != nil {
|
||||
if err := ps.binaryInstaller.ConfigureCaddy(caddyDomain, email, acmeEndpoint, baseDomain); err != nil {
|
||||
ps.logf(" ⚠️ Caddy config warning: %v", err)
|
||||
} else {
|
||||
ps.logf(" ✓ Caddy config generated")
|
||||
@ -686,7 +686,8 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Start services in dependency order
|
||||
// Restart services in dependency order (restart instead of start ensures
|
||||
// services pick up new configs even if already running from a previous install)
|
||||
ps.logf(" Starting services...")
|
||||
|
||||
// Start infrastructure first (IPFS, Olric, Anyone) - RQLite is managed internally by each node
|
||||
@ -705,9 +706,9 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
|
||||
infraServices = append(infraServices, "debros-anyone-relay.service")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for _, svc := range infraServices {
|
||||
if err := ps.serviceController.StartService(svc); err != nil {
|
||||
if err := ps.serviceController.RestartService(svc); err != nil {
|
||||
ps.logf(" ⚠️ Failed to start %s: %v", svc, err)
|
||||
} else {
|
||||
ps.logf(" - %s started", svc)
|
||||
@ -718,14 +719,14 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
// Start IPFS Cluster
|
||||
if err := ps.serviceController.StartService("debros-ipfs-cluster.service"); err != nil {
|
||||
if err := ps.serviceController.RestartService("debros-ipfs-cluster.service"); err != nil {
|
||||
ps.logf(" ⚠️ Failed to start debros-ipfs-cluster.service: %v", err)
|
||||
} else {
|
||||
ps.logf(" - debros-ipfs-cluster.service started")
|
||||
}
|
||||
|
||||
// Start node service (gateway is embedded in node, no separate service needed)
|
||||
if err := ps.serviceController.StartService("debros-node.service"); err != nil {
|
||||
if err := ps.serviceController.RestartService("debros-node.service"); err != nil {
|
||||
ps.logf(" ⚠️ Failed to start debros-node.service: %v", err)
|
||||
} else {
|
||||
ps.logf(" - debros-node.service started (with embedded gateway)")
|
||||
@ -735,14 +736,14 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
|
||||
// Caddy depends on debros-node.service (gateway on :6001), so start after node
|
||||
if ps.isNameserver {
|
||||
if _, err := os.Stat("/usr/local/bin/coredns"); err == nil {
|
||||
if err := ps.serviceController.StartService("coredns.service"); err != nil {
|
||||
if err := ps.serviceController.RestartService("coredns.service"); err != nil {
|
||||
ps.logf(" ⚠️ Failed to start coredns.service: %v", err)
|
||||
} else {
|
||||
ps.logf(" - coredns.service started")
|
||||
}
|
||||
}
|
||||
if _, err := os.Stat("/usr/bin/caddy"); err == nil {
|
||||
if err := ps.serviceController.StartService("caddy.service"); err != nil {
|
||||
if err := ps.serviceController.RestartService("caddy.service"); err != nil {
|
||||
ps.logf(" ⚠️ Failed to start caddy.service: %v", err)
|
||||
} else {
|
||||
ps.logf(" - caddy.service started")
|
||||
|
||||
@ -216,8 +216,9 @@ func (ssg *SystemdServiceGenerator) GenerateNodeService() string {
|
||||
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=DeBros Network Node
|
||||
After=debros-ipfs-cluster.service debros-olric.service
|
||||
After=debros-ipfs-cluster.service debros-olric.service wg-quick@wg0.service
|
||||
Wants=debros-ipfs-cluster.service debros-olric.service
|
||||
Requires=wg-quick@wg0.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
|
||||
@ -157,8 +157,11 @@ func (wp *WireGuardProvisioner) Enable() error {
|
||||
return fmt.Errorf("failed to enable wg-quick@wg0: %w\n%s", err, string(output))
|
||||
}
|
||||
|
||||
// Start now
|
||||
cmd = exec.Command("systemctl", "start", "wg-quick@wg0")
|
||||
// Use restart instead of start. wg-quick@wg0 is a oneshot service with
|
||||
// RemainAfterExit=yes, so "systemctl start" is a no-op if the service is
|
||||
// already in "active (exited)" state (e.g. from a previous install that
|
||||
// wasn't fully cleaned). "restart" always re-runs the ExecStart command.
|
||||
cmd = exec.Command("systemctl", "restart", "wg-quick@wg0")
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("failed to start wg-quick@wg0: %w\n%s", err, string(output))
|
||||
}
|
||||
|
||||
@ -118,12 +118,15 @@ func (n *Node) updateDNSHeartbeat(ctx context.Context) error {
|
||||
// ensureBaseDNSRecords ensures this node's IP is present in the base DNS records.
|
||||
// This provides self-healing: if records are missing (fresh install, DB reset),
|
||||
// the node recreates them on startup. Each node only manages its own IP entries.
|
||||
//
|
||||
// Records are created for BOTH the base domain (dbrs.space) and the node domain
|
||||
// (node1.dbrs.space). The base domain records enable round-robin load balancing
|
||||
// across all nodes. The node domain records enable direct node access.
|
||||
func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
|
||||
domain := n.config.Node.Domain
|
||||
if domain == "" {
|
||||
domain = n.config.HTTPGateway.BaseDomain
|
||||
}
|
||||
if domain == "" {
|
||||
baseDomain := n.config.HTTPGateway.BaseDomain
|
||||
nodeDomain := n.config.Node.Domain
|
||||
|
||||
if baseDomain == "" && nodeDomain == "" {
|
||||
return nil // No domain configured, skip
|
||||
}
|
||||
|
||||
@ -132,22 +135,32 @@ func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
|
||||
return fmt.Errorf("failed to determine node IP: %w", err)
|
||||
}
|
||||
|
||||
// Ensure trailing dot for FQDN format (as CoreDNS expects)
|
||||
fqdn := domain + "."
|
||||
wildcardFQDN := "*." + domain + "."
|
||||
|
||||
db := n.rqliteAdapter.GetSQLDB()
|
||||
|
||||
// Build list of A records to ensure
|
||||
var records []struct {
|
||||
fqdn string
|
||||
value string
|
||||
}
|
||||
|
||||
// Base domain records (e.g., dbrs.space, *.dbrs.space) — for round-robin across all nodes
|
||||
if baseDomain != "" {
|
||||
records = append(records,
|
||||
struct{ fqdn, value string }{baseDomain + ".", ipAddress},
|
||||
struct{ fqdn, value string }{"*." + baseDomain + ".", ipAddress},
|
||||
)
|
||||
}
|
||||
|
||||
// Node-specific records (e.g., node1.dbrs.space, *.node1.dbrs.space) — for direct node access
|
||||
if nodeDomain != "" && nodeDomain != baseDomain {
|
||||
records = append(records,
|
||||
struct{ fqdn, value string }{nodeDomain + ".", ipAddress},
|
||||
struct{ fqdn, value string }{"*." + nodeDomain + ".", ipAddress},
|
||||
)
|
||||
}
|
||||
|
||||
// Insert root A record and wildcard A record for this node's IP
|
||||
// ON CONFLICT DO NOTHING avoids duplicates (UNIQUE on fqdn, record_type, value)
|
||||
records := []struct {
|
||||
fqdn string
|
||||
value string
|
||||
}{
|
||||
{fqdn, ipAddress},
|
||||
{wildcardFQDN, ipAddress},
|
||||
}
|
||||
|
||||
for _, r := range records {
|
||||
query := `INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||
VALUES (?, 'A', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||
@ -158,12 +171,64 @@ func (n *Node) ensureBaseDNSRecords(ctx context.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Claim an NS slot if available (ns1, ns2, or ns3)
|
||||
n.claimNameserverSlot(ctx, domain, ipAddress)
|
||||
// Ensure SOA and NS records exist for the base domain (self-healing)
|
||||
if baseDomain != "" {
|
||||
n.ensureSOAAndNSRecords(ctx, baseDomain)
|
||||
}
|
||||
|
||||
// Claim an NS slot for the base domain (ns1/ns2/ns3)
|
||||
if baseDomain != "" {
|
||||
n.claimNameserverSlot(ctx, baseDomain, ipAddress)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureSOAAndNSRecords creates SOA and NS records for the base domain if they don't exist.
|
||||
// These are normally seeded during install Phase 7, but if that fails (e.g. migrations
|
||||
// not yet run), the heartbeat self-heals them here.
|
||||
func (n *Node) ensureSOAAndNSRecords(ctx context.Context, baseDomain string) {
|
||||
db := n.rqliteAdapter.GetSQLDB()
|
||||
fqdn := baseDomain + "."
|
||||
|
||||
// Check if SOA exists
|
||||
var count int
|
||||
err := db.QueryRowContext(ctx,
|
||||
`SELECT COUNT(*) FROM dns_records WHERE fqdn = ? AND record_type = 'SOA'`, fqdn,
|
||||
).Scan(&count)
|
||||
if err != nil || count > 0 {
|
||||
return // SOA exists or query failed, skip
|
||||
}
|
||||
|
||||
n.logger.ComponentInfo(logging.ComponentNode, "SOA/NS records missing, self-healing",
|
||||
zap.String("domain", baseDomain))
|
||||
|
||||
// Create SOA record
|
||||
soaValue := fmt.Sprintf("ns1.%s. admin.%s. %d 3600 1800 604800 300",
|
||||
baseDomain, baseDomain, time.Now().Unix())
|
||||
if _, err := db.ExecContext(ctx,
|
||||
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||
VALUES (?, 'SOA', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||
ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||
fqdn, soaValue,
|
||||
); err != nil {
|
||||
n.logger.ComponentWarn(logging.ComponentNode, "Failed to create SOA record", zap.Error(err))
|
||||
}
|
||||
|
||||
// Create NS records (ns1, ns2, ns3)
|
||||
for i := 1; i <= 3; i++ {
|
||||
nsValue := fmt.Sprintf("ns%d.%s.", i, baseDomain)
|
||||
if _, err := db.ExecContext(ctx,
|
||||
`INSERT INTO dns_records (fqdn, record_type, value, ttl, namespace, created_by, is_active, created_at, updated_at)
|
||||
VALUES (?, 'NS', ?, 300, 'system', 'system', TRUE, datetime('now'), datetime('now'))
|
||||
ON CONFLICT(fqdn, record_type, value) DO NOTHING`,
|
||||
fqdn, nsValue,
|
||||
); err != nil {
|
||||
n.logger.ComponentWarn(logging.ComponentNode, "Failed to create NS record", zap.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// claimNameserverSlot attempts to claim an available NS hostname (ns1/ns2/ns3) for this node.
|
||||
// If the node already has a slot, it updates the IP. If no slot is available, it does nothing.
|
||||
func (n *Node) claimNameserverSlot(ctx context.Context, domain, ipAddress string) {
|
||||
@ -236,11 +301,11 @@ func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
domain := n.config.Node.Domain
|
||||
if domain == "" {
|
||||
domain = n.config.HTTPGateway.BaseDomain
|
||||
baseDomain := n.config.HTTPGateway.BaseDomain
|
||||
if baseDomain == "" {
|
||||
baseDomain = n.config.Node.Domain
|
||||
}
|
||||
if domain == "" {
|
||||
if baseDomain == "" {
|
||||
return
|
||||
}
|
||||
|
||||
@ -255,8 +320,12 @@ func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
fqdn := domain + "."
|
||||
wildcardFQDN := "*." + domain + "."
|
||||
// Build all FQDNs to clean: base domain + node domain
|
||||
var fqdnsToClean []string
|
||||
fqdnsToClean = append(fqdnsToClean, baseDomain+".", "*."+baseDomain+".")
|
||||
if n.config.Node.Domain != "" && n.config.Node.Domain != baseDomain {
|
||||
fqdnsToClean = append(fqdnsToClean, n.config.Node.Domain+".", "*."+n.config.Node.Domain+".")
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var nodeID, ip string
|
||||
@ -270,7 +339,7 @@ func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
|
||||
}
|
||||
|
||||
// Remove the dead node's A records from round-robin
|
||||
for _, f := range []string{fqdn, wildcardFQDN} {
|
||||
for _, f := range fqdnsToClean {
|
||||
if _, err := db.ExecContext(ctx, `DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`, f, ip); err != nil {
|
||||
n.logger.ComponentWarn(logging.ComponentNode, "Failed to remove stale DNS record",
|
||||
zap.String("fqdn", f), zap.String("ip", ip), zap.Error(err))
|
||||
@ -284,7 +353,7 @@ func (n *Node) cleanupStaleNodeRecords(ctx context.Context) {
|
||||
|
||||
// Remove glue records for this node's IP (ns1.domain., ns2.domain., ns3.domain.)
|
||||
for _, ns := range []string{"ns1", "ns2", "ns3"} {
|
||||
nsFQDN := ns + "." + domain + "."
|
||||
nsFQDN := ns + "." + baseDomain + "."
|
||||
if _, err := db.ExecContext(ctx,
|
||||
`DELETE FROM dns_records WHERE fqdn = ? AND record_type = 'A' AND value = ? AND namespace = 'system'`,
|
||||
nsFQDN, ip,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user