package sandbox import ( "fmt" "os" "os/exec" "path/filepath" "strings" "time" "github.com/DeBrosOfficial/network/pkg/cli/remotessh" "github.com/DeBrosOfficial/network/pkg/inspector" ) // Create orchestrates the creation of a new sandbox cluster. func Create(name string) error { cfg, err := LoadConfig() if err != nil { return err } // Resolve wallet SSH key once for all phases sshKeyPath, cleanup, err := resolveVaultKeyOnce(cfg.SSHKey.VaultTarget) if err != nil { return fmt.Errorf("prepare SSH key: %w", err) } defer cleanup() // Check for existing active sandbox active, err := FindActiveSandbox() if err != nil { return err } if active != nil { return fmt.Errorf("sandbox %q is already active (status: %s)\nDestroy it first: orama sandbox destroy --name %s", active.Name, active.Status, active.Name) } // Generate name if not provided if name == "" { name = GenerateName() } fmt.Printf("Creating sandbox %q (%s, %d nodes)\n\n", name, cfg.Domain, 5) client := NewHetznerClient(cfg.HetznerAPIToken) state := &SandboxState{ Name: name, CreatedAt: time.Now().UTC(), Domain: cfg.Domain, Status: StatusCreating, } // Phase 1: Provision servers fmt.Println("Phase 1: Provisioning servers...") if err := phase1ProvisionServers(client, cfg, state); err != nil { cleanupFailedCreate(client, state) return fmt.Errorf("provision servers: %w", err) } SaveState(state) // Phase 2: Assign floating IPs fmt.Println("\nPhase 2: Assigning floating IPs...") if err := phase2AssignFloatingIPs(client, cfg, state, sshKeyPath); err != nil { return fmt.Errorf("assign floating IPs: %w", err) } SaveState(state) // Phase 3: Upload binary archive fmt.Println("\nPhase 3: Uploading binary archive...") if err := phase3UploadArchive(state, sshKeyPath); err != nil { return fmt.Errorf("upload archive: %w", err) } // Phase 4: Install genesis node fmt.Println("\nPhase 4: Installing genesis node...") tokens, err := phase4InstallGenesis(cfg, state, sshKeyPath) if err != nil { state.Status = StatusError SaveState(state) return fmt.Errorf("install genesis: %w", err) } // Phase 5: Join remaining nodes fmt.Println("\nPhase 5: Joining remaining nodes...") if err := phase5JoinNodes(cfg, state, tokens, sshKeyPath); err != nil { state.Status = StatusError SaveState(state) return fmt.Errorf("join nodes: %w", err) } // Phase 6: Verify cluster fmt.Println("\nPhase 6: Verifying cluster...") phase6Verify(cfg, state, sshKeyPath) state.Status = StatusRunning SaveState(state) printCreateSummary(cfg, state) return nil } // resolveVaultKeyOnce resolves a wallet SSH key to a temp file. // Returns the key path, cleanup function, and any error. func resolveVaultKeyOnce(vaultTarget string) (string, func(), error) { node := inspector.Node{User: "root", Host: "resolve-only", VaultTarget: vaultTarget} nodes := []inspector.Node{node} cleanup, err := remotessh.PrepareNodeKeys(nodes) if err != nil { return "", func() {}, err } return nodes[0].SSHKey, cleanup, nil } // phase1ProvisionServers creates 5 Hetzner servers in parallel. func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxState) error { type serverResult struct { index int server *HetznerServer err error } results := make(chan serverResult, 5) for i := 0; i < 5; i++ { go func(idx int) { role := "node" if idx < 2 { role = "nameserver" } serverName := fmt.Sprintf("sbx-%s-%d", state.Name, idx+1) labels := map[string]string{ "orama-sandbox": state.Name, "orama-sandbox-role": role, } req := CreateServerRequest{ Name: serverName, ServerType: cfg.ServerType, Image: "ubuntu-24.04", Location: cfg.Location, SSHKeys: []int64{cfg.SSHKey.HetznerID}, Labels: labels, } if cfg.FirewallID > 0 { req.Firewalls = []struct { Firewall int64 `json:"firewall"` }{{Firewall: cfg.FirewallID}} } srv, err := client.CreateServer(req) results <- serverResult{index: idx, server: srv, err: err} }(i) } servers := make([]ServerState, 5) var firstErr error for i := 0; i < 5; i++ { r := <-results if r.err != nil { if firstErr == nil { firstErr = fmt.Errorf("server %d: %w", r.index+1, r.err) } continue } fmt.Printf(" Created %s (ID: %d, initializing...)\n", r.server.Name, r.server.ID) role := "node" if r.index < 2 { role = "nameserver" } servers[r.index] = ServerState{ ID: r.server.ID, Name: r.server.Name, Role: role, } } state.Servers = servers // populate before returning so cleanup can delete created servers if firstErr != nil { return firstErr } // Wait for all servers to reach "running" fmt.Print(" Waiting for servers to boot...") for i := range servers { srv, err := client.WaitForServer(servers[i].ID, 3*time.Minute) if err != nil { return fmt.Errorf("wait for %s: %w", servers[i].Name, err) } servers[i].IP = srv.PublicNet.IPv4.IP fmt.Print(".") } fmt.Println(" OK") // Assign floating IPs to nameserver entries if len(cfg.FloatingIPs) >= 2 { servers[0].FloatingIP = cfg.FloatingIPs[0].IP servers[1].FloatingIP = cfg.FloatingIPs[1].IP } state.Servers = servers for _, srv := range servers { fmt.Printf(" %s: %s (%s)\n", srv.Name, srv.IP, srv.Role) } return nil } // phase2AssignFloatingIPs assigns floating IPs and configures loopback. func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxState, sshKeyPath string) error { for i := 0; i < 2 && i < len(cfg.FloatingIPs) && i < len(state.Servers); i++ { fip := cfg.FloatingIPs[i] srv := state.Servers[i] // Unassign if currently assigned elsewhere (ignore "not assigned" errors) fmt.Printf(" Assigning %s to %s...\n", fip.IP, srv.Name) if err := client.UnassignFloatingIP(fip.ID); err != nil { // Log but continue — may fail if not currently assigned, which is fine fmt.Printf(" Note: unassign %s: %v (continuing)\n", fip.IP, err) } if err := client.AssignFloatingIP(fip.ID, srv.ID); err != nil { return fmt.Errorf("assign %s to %s: %w", fip.IP, srv.Name, err) } // Configure floating IP on the server's loopback interface // Hetzner floating IPs require this: ip addr add /32 dev lo node := inspector.Node{ User: "root", Host: srv.IP, SSHKey: sshKeyPath, } // Wait for SSH to be ready on freshly booted servers if err := waitForSSH(node, 5*time.Minute); err != nil { return fmt.Errorf("SSH not ready on %s: %w", srv.Name, err) } cmd := fmt.Sprintf("ip addr add %s/32 dev lo 2>/dev/null || true", fip.IP) if err := remotessh.RunSSHStreaming(node, cmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("configure loopback on %s: %w", srv.Name, err) } } return nil } // waitForSSH polls until SSH is responsive on the node. func waitForSSH(node inspector.Node, timeout time.Duration) error { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { _, err := runSSHOutput(node, "echo ok") if err == nil { return nil } time.Sleep(3 * time.Second) } return fmt.Errorf("timeout after %s", timeout) } // phase3UploadArchive uploads the binary archive to the genesis node, then fans out // to the remaining nodes server-to-server (much faster than uploading from local machine). func phase3UploadArchive(state *SandboxState, sshKeyPath string) error { archivePath := findNewestArchive() if archivePath == "" { fmt.Println(" No binary archive found, run `orama build` first") return fmt.Errorf("no binary archive found in /tmp/ (run `orama build` first)") } info, _ := os.Stat(archivePath) fmt.Printf(" Archive: %s (%s)\n", filepath.Base(archivePath), formatBytes(info.Size())) if err := fanoutArchive(state.Servers, sshKeyPath, archivePath); err != nil { return err } fmt.Println(" All nodes ready") return nil } // phase4InstallGenesis installs the genesis node and generates invite tokens. func phase4InstallGenesis(cfg *Config, state *SandboxState, sshKeyPath string) ([]string, error) { genesis := state.GenesisServer() node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} // Install genesis installCmd := fmt.Sprintf("/opt/orama/bin/orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesis.IP, cfg.Domain, cfg.Domain) fmt.Printf(" Installing on %s (%s)...\n", genesis.Name, genesis.IP) if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return nil, fmt.Errorf("install genesis: %w", err) } // Wait for RQLite leader fmt.Print(" Waiting for RQLite leader...") if err := waitForRQLiteHealth(node, 3*time.Minute); err != nil { return nil, fmt.Errorf("genesis health: %w", err) } fmt.Println(" OK") // Generate invite tokens (one per remaining node) fmt.Print(" Generating invite tokens...") remaining := len(state.Servers) - 1 tokens := make([]string, remaining) for i := 0; i < remaining; i++ { token, err := generateInviteToken(node) if err != nil { return nil, fmt.Errorf("generate invite token %d: %w", i+1, err) } tokens[i] = token fmt.Print(".") } fmt.Println(" OK") return tokens, nil } // phase5JoinNodes joins the remaining 4 nodes to the cluster (serial). func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string, sshKeyPath string) error { genesisIP := state.GenesisServer().IP for i := 1; i < len(state.Servers); i++ { srv := state.Servers[i] node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} token := tokens[i-1] var installCmd string if srv.Role == "nameserver" { installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesisIP, token, srv.IP, cfg.Domain, cfg.Domain) } else { installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", genesisIP, token, srv.IP, cfg.Domain) } fmt.Printf(" [%d/%d] Joining %s (%s, %s)...\n", i, len(state.Servers)-1, srv.Name, srv.IP, srv.Role) if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("join %s: %w", srv.Name, err) } // Wait for node health before proceeding fmt.Printf(" Waiting for %s health...", srv.Name) if err := waitForRQLiteHealth(node, 3*time.Minute); err != nil { fmt.Printf(" WARN: %v\n", err) } else { fmt.Println(" OK") } } return nil } // phase6Verify runs a basic cluster health check. func phase6Verify(cfg *Config, state *SandboxState, sshKeyPath string) { genesis := state.GenesisServer() node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} // Check RQLite cluster out, err := runSSHOutput(node, "curl -s http://localhost:5001/status | grep -o '\"state\":\"[^\"]*\"' | head -1") if err == nil { fmt.Printf(" RQLite: %s\n", strings.TrimSpace(out)) } // Check DNS (if floating IPs configured, only with safe domain names) if len(cfg.FloatingIPs) > 0 && isSafeDNSName(cfg.Domain) { out, err = runSSHOutput(node, fmt.Sprintf("dig +short @%s test.%s 2>/dev/null || echo 'DNS not responding'", cfg.FloatingIPs[0].IP, cfg.Domain)) if err == nil { fmt.Printf(" DNS: %s\n", strings.TrimSpace(out)) } } } // waitForRQLiteHealth polls RQLite until it reports Leader or Follower state. func waitForRQLiteHealth(node inspector.Node, timeout time.Duration) error { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { out, err := runSSHOutput(node, "curl -sf http://localhost:5001/status 2>/dev/null | grep -o '\"state\":\"[^\"]*\"'") if err == nil { result := strings.TrimSpace(out) if strings.Contains(result, "Leader") || strings.Contains(result, "Follower") { return nil } } time.Sleep(5 * time.Second) } return fmt.Errorf("timeout waiting for RQLite health after %s", timeout) } // generateInviteToken runs `orama node invite` on the node and parses the token. func generateInviteToken(node inspector.Node) (string, error) { out, err := runSSHOutput(node, "/opt/orama/bin/orama node invite --expiry 1h 2>&1") if err != nil { return "", fmt.Errorf("invite command failed: %w", err) } // Parse token from output — the invite command outputs: // "sudo orama install --join https://... --token <64-char-hex> --vps-ip ..." // Look for the --token flag value first fields := strings.Fields(out) for i, field := range fields { if field == "--token" && i+1 < len(fields) { candidate := fields[i+1] if len(candidate) == 64 && isHex(candidate) { return candidate, nil } } } // Fallback: look for any standalone 64-char hex string for _, word := range fields { if len(word) == 64 && isHex(word) { return word, nil } } return "", fmt.Errorf("could not parse token from invite output:\n%s", out) } // isSafeDNSName returns true if the string is safe to use in shell commands. func isSafeDNSName(s string) bool { for _, c := range s { if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' || c == '-') { return false } } return len(s) > 0 } // isHex returns true if s contains only hex characters. func isHex(s string) bool { for _, c := range s { if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { return false } } return true } // runSSHOutput runs a command via SSH and returns stdout as a string. // Uses StrictHostKeyChecking=no because sandbox IPs are frequently recycled. func runSSHOutput(node inspector.Node, command string) (string, error) { args := []string{ "ssh", "-n", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=10", "-o", "BatchMode=yes", "-i", node.SSHKey, fmt.Sprintf("%s@%s", node.User, node.Host), command, } out, err := execCommand(args[0], args[1:]...) return string(out), err } // execCommand runs a command and returns its output. func execCommand(name string, args ...string) ([]byte, error) { return exec.Command(name, args...).Output() } // findNewestArchive finds the newest binary archive in /tmp/. func findNewestArchive() string { entries, err := os.ReadDir("/tmp") if err != nil { return "" } var best string var bestMod int64 for _, entry := range entries { name := entry.Name() if strings.HasPrefix(name, "orama-") && strings.Contains(name, "-linux-") && strings.HasSuffix(name, ".tar.gz") { info, err := entry.Info() if err != nil { continue } if info.ModTime().Unix() > bestMod { best = filepath.Join("/tmp", name) bestMod = info.ModTime().Unix() } } } return best } // formatBytes formats a byte count as human-readable. func formatBytes(b int64) string { const unit = 1024 if b < unit { return fmt.Sprintf("%d B", b) } div, exp := int64(unit), 0 for n := b / unit; n >= unit; n /= unit { div *= unit exp++ } return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp]) } // printCreateSummary prints the cluster summary after creation. func printCreateSummary(cfg *Config, state *SandboxState) { fmt.Printf("\nSandbox %q ready (%d nodes)\n", state.Name, len(state.Servers)) fmt.Println() fmt.Println("Nameservers:") for _, srv := range state.NameserverNodes() { floating := "" if srv.FloatingIP != "" { floating = fmt.Sprintf(" (floating: %s)", srv.FloatingIP) } fmt.Printf(" %s: %s%s\n", srv.Name, srv.IP, floating) } fmt.Println("Nodes:") for _, srv := range state.RegularNodes() { fmt.Printf(" %s: %s\n", srv.Name, srv.IP) } fmt.Println() fmt.Printf("Domain: %s\n", cfg.Domain) fmt.Printf("Gateway: https://%s\n", cfg.Domain) fmt.Println() fmt.Println("SSH: orama sandbox ssh 1") fmt.Println("Destroy: orama sandbox destroy") } // cleanupFailedCreate deletes any servers that were created during a failed provision. func cleanupFailedCreate(client *HetznerClient, state *SandboxState) { if len(state.Servers) == 0 { return } fmt.Println("\nCleaning up failed creation...") for _, srv := range state.Servers { if srv.ID > 0 { client.DeleteServer(srv.ID) fmt.Printf(" Deleted %s\n", srv.Name) } } DeleteState(state.Name) }