diff --git a/Makefile b/Makefile index a94a4dc..31a20d5 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ test-e2e-quick: .PHONY: build clean test deps tidy fmt vet lint install-hooks redeploy-devnet redeploy-testnet release health -VERSION := 0.107.0 +VERSION := 0.107.2 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' diff --git a/docs/DEVNET_INSTALL.md b/docs/DEVNET_INSTALL.md index 7467aea..bf1aab3 100644 --- a/docs/DEVNET_INSTALL.md +++ b/docs/DEVNET_INSTALL.md @@ -41,7 +41,7 @@ Install nodes **one at a time**, waiting for each to complete before starting th ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --vps-ip \ --domain \ --base-domain \ @@ -50,7 +50,7 @@ sudo orama install \ After ns1 is installed, generate invite tokens: ```bash -orama invite --expiry 24h +sudo orama node invite --expiry 24h ``` ## ns2 - Nameserver + Relay @@ -58,7 +58,7 @@ orama invite --expiry 24h ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --join http:// --token \ --vps-ip \ --domain \ @@ -68,8 +68,7 @@ sudo orama install \ --anyone-nickname \ --anyone-wallet \ --anyone-contact "" \ - --anyone-family ",,..." \ - --anyone-bandwidth 30 + --anyone-family ",,..." ``` ## ns3 - Nameserver + Relay @@ -77,7 +76,7 @@ sudo orama install \ ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --join http:// --token \ --vps-ip \ --domain \ @@ -87,27 +86,25 @@ sudo orama install \ --anyone-nickname \ --anyone-wallet \ --anyone-contact "" \ - --anyone-family ",,..." \ - --anyone-bandwidth 30 + --anyone-family ",,..." ``` ## node4 - Non-Nameserver + Relay +Domain is auto-generated (e.g., `node-a3f8k2.`). No `--domain` flag needed. + ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --join http:// --token \ --vps-ip \ - --domain node4. \ --base-domain \ - --skip-checks \ --anyone-relay --anyone-migrate \ --anyone-nickname \ --anyone-wallet \ --anyone-contact "" \ - --anyone-family ",,..." \ - --anyone-bandwidth 30 + --anyone-family ",,..." ``` ## node5 - Non-Nameserver + Relay @@ -115,18 +112,15 @@ sudo orama install \ ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --join http:// --token \ --vps-ip \ - --domain node5. \ --base-domain \ - --skip-checks \ --anyone-relay --anyone-migrate \ --anyone-nickname \ --anyone-wallet \ --anyone-contact "" \ - --anyone-family ",,..." \ - --anyone-bandwidth 30 + --anyone-family ",,..." ``` ## node6 - Non-Nameserver (No Anyone Relay) @@ -134,12 +128,10 @@ sudo orama install \ ```bash # SSH: @ -sudo orama install \ +sudo orama node install \ --join http:// --token \ --vps-ip \ - --domain node6. \ - --base-domain \ - --skip-checks + --base-domain ``` ## Verification @@ -147,13 +139,14 @@ sudo orama install \ After all nodes are installed, verify cluster health: ```bash -# Check RQLite cluster (from any node) +# Full cluster report (from local machine) +./bin/orama monitor report --env devnet + +# Single node health +./bin/orama monitor report --env devnet --node + +# Or manually from any VPS: curl -s http://localhost:5001/status | jq -r '.store.raft.state, .store.raft.num_peers' -# Should show: Leader (on one node) and N-1 peers - -# Check gateway health curl -s http://localhost:6001/health - -# Check Anyone relay (on nodes with relays) systemctl status orama-anyone-relay ``` diff --git a/docs/DEV_DEPLOY.md b/docs/DEV_DEPLOY.md index ec7a8bf..c33537d 100644 --- a/docs/DEV_DEPLOY.md +++ b/docs/DEV_DEPLOY.md @@ -81,19 +81,33 @@ for ip in ; do done # 5. Find the RQLite leader (upgrade this one LAST) -ssh ubuntu@ 'curl -s http://localhost:5001/status | jq -r .store.raft.state' +orama monitor report --env +# Check "rqlite_leader" in summary output # 6. Upgrade FOLLOWER nodes one at a time ssh ubuntu@ 'sudo orama node stop && sudo orama node upgrade --restart' -# Wait for rejoin before proceeding to next node -ssh ubuntu@ 'curl -s http://localhost:5001/status | jq -r .store.raft.num_peers' -# Should show expected number of peers (N-1) +# IMPORTANT: Verify FULL health before proceeding to next node: +orama monitor report --env --node +# Check: +# - All services active, 0 restart loops +# - RQLite: Follower state, applied_index matches cluster +# - All RQLite peers reachable (no partition alerts) +# - WireGuard peers connected with recent handshakes +# Only proceed to next node after ALL checks pass. +# +# NOTE: After restarting a node, other nodes may briefly report it as +# "unreachable" with "broken pipe" errors. This is normal — Raft TCP +# connections need ~1-2 minutes to re-establish. Wait and re-check +# before escalating. # Repeat for each follower... # 7. Upgrade the LEADER node last ssh ubuntu@ 'sudo orama node stop && sudo orama node upgrade --restart' + +# Verify the new leader was elected and cluster is fully healthy: +orama monitor report --env ``` #### What NOT to Do @@ -140,7 +154,7 @@ To deploy to all nodes, repeat steps 3-5 (dev) or 3-4 (production) for each VPS | Flag | Description | |------|-------------| | `--vps-ip ` | VPS public IP address (required) | -| `--domain ` | Domain for HTTPS certificates. Nameserver nodes use the base domain (e.g., `example.com`); non-nameserver nodes use a subdomain (e.g., `node-4.example.com`) | +| `--domain ` | Domain for HTTPS certificates. Required for nameserver nodes (use the base domain, e.g., `example.com`). Auto-generated for non-nameserver nodes if omitted (e.g., `node-a3f8k2.example.com`) | | `--base-domain ` | Base domain for deployment routing (e.g., example.com) | | `--nameserver` | Configure this node as a nameserver (CoreDNS + Caddy) | | `--join ` | Join existing cluster via HTTPS URL (e.g., `https://node1.example.com`) | @@ -242,13 +256,16 @@ sudo orama node install --vps-ip 1.2.3.4 --domain example.com \ --base-domain example.com --nameserver # 2. On genesis node, generate an invite -orama node invite +orama node invite --expiry 24h # Output: sudo orama node install --join https://example.com --token --vps-ip -# 3. On the new node, run the printed command -# Nameserver nodes use the base domain; non-nameserver nodes use subdomains (e.g., node-4.example.com) -sudo orama node install --join https://example.com --token abc123... \ +# 3a. Join as nameserver (requires --domain set to base domain) +sudo orama node install --join http://1.2.3.4 --token abc123... \ --vps-ip 5.6.7.8 --domain example.com --base-domain example.com --nameserver + +# 3b. Join as regular node (domain auto-generated, no --domain needed) +sudo orama node install --join http://1.2.3.4 --token abc123... \ + --vps-ip 5.6.7.8 --base-domain example.com ``` The join flow establishes a WireGuard VPN tunnel before starting cluster services. diff --git a/pkg/cli/production/commands.go b/pkg/cli/production/commands.go index a15c8e2..6ff03e8 100644 --- a/pkg/cli/production/commands.go +++ b/pkg/cli/production/commands.go @@ -80,7 +80,7 @@ func ShowHelp() { fmt.Printf(" --interactive - Launch interactive TUI wizard\n") fmt.Printf(" --force - Reconfigure all settings\n") fmt.Printf(" --vps-ip IP - VPS public IP address (required)\n") - fmt.Printf(" --domain DOMAIN - Domain for this node (e.g., node-1.orama.network)\n") + fmt.Printf(" --domain DOMAIN - Domain for HTTPS (auto-generated if omitted)\n") fmt.Printf(" --peers ADDRS - Comma-separated peer multiaddrs (for joining cluster)\n") fmt.Printf(" --join ADDR - RQLite join address IP:port (for joining cluster)\n") fmt.Printf(" --cluster-secret HEX - 64-hex cluster secret (required when joining)\n") diff --git a/pkg/cli/production/install/flags.go b/pkg/cli/production/install/flags.go index 5b55e64..3e4788c 100644 --- a/pkg/cli/production/install/flags.go +++ b/pkg/cli/production/install/flags.go @@ -52,7 +52,7 @@ func ParseFlags(args []string) (*Flags, error) { flags := &Flags{} fs.StringVar(&flags.VpsIP, "vps-ip", "", "Public IP of this VPS (required)") - fs.StringVar(&flags.Domain, "domain", "", "Domain name for HTTPS (optional, e.g. gateway.example.com)") + fs.StringVar(&flags.Domain, "domain", "", "Domain for HTTPS (auto-generated for non-nameserver nodes if omitted)") fs.StringVar(&flags.BaseDomain, "base-domain", "", "Base domain for deployment routing (e.g., dbrs.space)") fs.BoolVar(&flags.Force, "force", false, "Force reconfiguration even if already installed") fs.BoolVar(&flags.DryRun, "dry-run", false, "Show what would be done without making changes") diff --git a/pkg/cli/production/install/orchestrator.go b/pkg/cli/production/install/orchestrator.go index 1ee2bbb..f3ecd67 100644 --- a/pkg/cli/production/install/orchestrator.go +++ b/pkg/cli/production/install/orchestrator.go @@ -2,6 +2,7 @@ package install import ( "bufio" + "crypto/rand" "crypto/tls" "encoding/json" "fmt" @@ -295,6 +296,12 @@ func (o *Orchestrator) executeJoinFlow() error { } fmt.Printf(" ✓ Secrets saved\n") + // Auto-generate domain for non-nameserver joining nodes + if o.flags.Domain == "" && !o.flags.Nameserver && joinResp.BaseDomain != "" { + o.flags.Domain = generateNodeDomain(joinResp.BaseDomain) + fmt.Printf("\n🌐 Auto-generated domain: %s\n", o.flags.Domain) + } + // Step 7: Generate configs using WG IP as advertise address // All inter-node communication uses WireGuard IPs, not public IPs fmt.Printf("\nâš™ī¸ Generating configurations...\n") @@ -537,3 +544,17 @@ func (o *Orchestrator) installNamespaceTemplates() error { return nil } + +// generateNodeDomain creates a random subdomain like "node-a3f8k2.example.com" +func generateNodeDomain(baseDomain string) string { + const chars = "abcdefghijklmnopqrstuvwxyz0123456789" + b := make([]byte, 6) + if _, err := rand.Read(b); err != nil { + // Fallback to timestamp-based + return fmt.Sprintf("node-%06x.%s", time.Now().UnixNano()%0xffffff, baseDomain) + } + for i := range b { + b[i] = chars[int(b[i])%len(chars)] + } + return fmt.Sprintf("node-%s.%s", string(b), baseDomain) +} diff --git a/pkg/cli/production/lifecycle/stop.go b/pkg/cli/production/lifecycle/stop.go index 9db0264..0e7f289 100644 --- a/pkg/cli/production/lifecycle/stop.go +++ b/pkg/cli/production/lifecycle/stop.go @@ -59,11 +59,13 @@ func HandleStopWithFlags(force bool) { {"coredns", "caddy"}, // 5. Stop DNS/TLS last } - // First, disable all services to prevent auto-restart - disableArgs := []string{"disable"} - disableArgs = append(disableArgs, services...) - if err := exec.Command("systemctl", disableArgs...).Run(); err != nil { - fmt.Printf(" Warning: Failed to disable some services: %v\n", err) + // Mask all services to immediately prevent Restart=always from reviving them. + // Unlike "disable" (which only removes boot symlinks), "mask" links the unit + // to /dev/null so systemd cannot start it at all. Unmasked by "orama node start". + maskArgs := []string{"mask"} + maskArgs = append(maskArgs, services...) + if err := exec.Command("systemctl", maskArgs...).Run(); err != nil { + fmt.Printf(" Warning: Failed to mask some services: %v\n", err) } // Stop services in order with brief pauses between groups @@ -135,31 +137,16 @@ func HandleStopWithFlags(force bool) { } } - // Disable the service to prevent it from auto-starting on boot - enabled, err := utils.IsServiceEnabled(svc) - if err != nil { - fmt.Printf(" âš ī¸ Unable to check if %s is enabled: %v\n", svc, err) - // Continue anyway - try to disable - } - if enabled { - if err := exec.Command("systemctl", "disable", svc).Run(); err != nil { - fmt.Printf(" âš ī¸ Failed to disable %s: %v\n", svc, err) - hadError = true - } else { - fmt.Printf(" ✓ Disabled %s (will not auto-start on boot)\n", svc) - } - } else { - fmt.Printf(" â„šī¸ %s already disabled\n", svc) - } + // Service is already masked (prevents both restart and boot start). + // No additional disable needed. } if hadError { - fmt.Fprintf(os.Stderr, "\nâš ī¸ Some services may still be restarting due to Restart=always\n") + fmt.Fprintf(os.Stderr, "\nâš ī¸ Some services could not be stopped cleanly\n") fmt.Fprintf(os.Stderr, " Check status with: systemctl list-units 'orama-*'\n") - fmt.Fprintf(os.Stderr, " If services are still restarting, they may need manual intervention\n") } else { - fmt.Printf("\n✅ All services stopped and disabled (will not auto-start on boot)\n") - fmt.Printf(" Use 'orama node start' to start and re-enable services\n") + fmt.Printf("\n✅ All services stopped and masked (will not auto-start on boot)\n") + fmt.Printf(" Use 'orama node start' to unmask and start services\n") } } diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 0defaf8..459f68b 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -244,7 +244,8 @@ WantedBy=multi-user.target `, ssg.oramaHome, ssg.oramaDir, logFile) } -// GenerateAnyoneClientService generates the Anyone Client SOCKS5 proxy systemd unit +// GenerateAnyoneClientService generates the Anyone Client SOCKS5 proxy systemd unit. +// Uses the same anon binary as the relay, but with a client-only config (SocksPort only, no relay). func (ssg *SystemdServiceGenerator) GenerateAnyoneClientService() string { logFile := filepath.Join(ssg.oramaDir, "logs", "anyone-client.log") @@ -255,14 +256,13 @@ Wants=network-online.target [Service] Type=simple -Environment=HOME=%[1]s -Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/lib/node_modules/.bin -WorkingDirectory=%[1]s -ExecStart=/usr/bin/npx anyone-client -Restart=always +User=debian-anon +Group=debian-anon +ExecStart=/usr/bin/anon -f /etc/anon/anonrc +Restart=on-failure RestartSec=5 -StandardOutput=append:%[2]s -StandardError=append:%[2]s +StandardOutput=append:%[1]s +StandardError=append:%[1]s SyslogIdentifier=anyone-client PrivateTmp=yes @@ -273,7 +273,7 @@ MemoryMax=1G [Install] WantedBy=multi-user.target -`, ssg.oramaHome, logFile, ssg.oramaDir) +`, logFile) } // GenerateAnyoneRelayService generates the Anyone Relay operator systemd unit diff --git a/pkg/environments/templates/systemd_node.service b/pkg/environments/templates/systemd_node.service index edbcab6..bb57e0d 100644 --- a/pkg/environments/templates/systemd_node.service +++ b/pkg/environments/templates/systemd_node.service @@ -11,6 +11,9 @@ Environment=HOME={{.HomeDir}} ExecStart={{.HomeDir}}/bin/orama-node --config {{.OramaDir}}/configs/{{.ConfigFile}} Restart=always RestartSec=5 +TimeoutStopSec=45s +KillMode=mixed +KillSignal=SIGTERM StandardOutput=journal StandardError=journal SyslogIdentifier=orama-node-{{.NodeType}} diff --git a/scripts/clean-testnet.sh b/scripts/clean-testnet.sh index 5c4eb83..1b5ddbe 100755 --- a/scripts/clean-testnet.sh +++ b/scripts/clean-testnet.sh @@ -1,11 +1,13 @@ #!/usr/bin/env bash # -# Clean all testnet nodes for fresh reinstall. +# Clean testnet nodes for fresh reinstall. # Preserves Anyone relay keys (/var/lib/anon/) for --anyone-migrate. # DOES NOT TOUCH DEVNET NODES. # -# Usage: scripts/clean-testnet.sh [--nuclear] +# Usage: scripts/clean-testnet.sh [--nuclear] [IP ...] # --nuclear Also remove shared binaries (rqlited, ipfs, coredns, caddy, etc.) +# IP ... Optional: only clean specific nodes by IP (e.g. 62.72.44.87 51.178.84.172) +# If no IPs given, cleans ALL testnet nodes. # set -euo pipefail @@ -16,7 +18,14 @@ CONF="$ROOT_DIR/scripts/remote-nodes.conf" command -v sshpass >/dev/null 2>&1 || { echo "ERROR: sshpass not installed (brew install sshpass / apt install sshpass)"; exit 1; } NUCLEAR=false -[[ "${1:-}" == "--nuclear" ]] && NUCLEAR=true +TARGET_IPS=() +for arg in "$@"; do + if [[ "$arg" == "--nuclear" ]]; then + NUCLEAR=true + else + TARGET_IPS+=("$arg") + fi +done SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -o LogLevel=ERROR -o PubkeyAuthentication=no) @@ -130,24 +139,41 @@ while IFS='|' read -r env hostspec pass role key; do env="$(echo "$env" | xargs)" [[ "$env" != "testnet" ]] && continue + # If target IPs specified, only include matching nodes + if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then + node_ip="${hostspec#*@}" + matched=false + for tip in "${TARGET_IPS[@]}"; do + [[ "$tip" == "$node_ip" ]] && matched=true && break + done + $matched || continue + fi + hosts+=("$hostspec") passes+=("$pass") users+=("${hostspec%%@*}") done < "$CONF" if [[ ${#hosts[@]} -eq 0 ]]; then - echo "ERROR: No testnet nodes found in $CONF" + if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then + echo "ERROR: No testnet nodes found matching: ${TARGET_IPS[*]}" + else + echo "ERROR: No testnet nodes found in $CONF" + fi exit 1 fi -echo "== clean-testnet.sh — ${#hosts[@]} testnet nodes ==" +if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then + echo "== clean-testnet.sh — ${#hosts[@]} selected node(s) ==" +else + echo "== clean-testnet.sh — ${#hosts[@]} testnet nodes (ALL) ==" +fi for i in "${!hosts[@]}"; do echo " [$((i+1))] ${hosts[$i]}" done echo "" -echo "This will CLEAN all testnet nodes (stop services, remove data)." +echo "This will CLEAN the above node(s) (stop services, remove data)." echo "Anyone relay keys (/var/lib/anon/) will be PRESERVED." -echo "Devnet nodes will NOT be touched." $NUCLEAR && echo "Nuclear mode: shared binaries will also be removed." echo "" read -rp "Type 'yes' to continue: " confirm diff --git a/systemd/orama-namespace-rqlite@.service b/systemd/orama-namespace-rqlite@.service index 3e982b4..09f2330 100644 --- a/systemd/orama-namespace-rqlite@.service +++ b/systemd/orama-namespace-rqlite@.service @@ -22,7 +22,7 @@ ExecStart=/bin/sh -c 'exec /usr/local/bin/rqlited \ /opt/orama/.orama/data/namespaces/%i/rqlite/${NODE_ID}' # Graceful shutdown -TimeoutStopSec=30s +TimeoutStopSec=60s KillMode=mixed KillSignal=SIGTERM