From 646801913690bc9c8c0c2f328b1d61d2d50f0634 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 7 Mar 2026 14:27:09 +0200 Subject: [PATCH] feat(sandbox): optimize archive upload via server-to-server fanout - add WithNoHostKeyCheck option for ephemeral server IPs - upload binary to genesis then distribute to other nodes (faster) - improve provisioning error handling for cleanup on partial failure --- pkg/cli/remotessh/ssh.go | 39 +++++-- pkg/cli/sandbox/create.go | 121 +++++++++++++------- pkg/cli/sandbox/rollout.go | 6 +- pkg/cli/sandbox/ssh_cmd.go | 3 +- pkg/environments/production/orchestrator.go | 17 ++- pkg/environments/production/services.go | 1 + 6 files changed, 131 insertions(+), 56 deletions(-) diff --git a/pkg/cli/remotessh/ssh.go b/pkg/cli/remotessh/ssh.go index 803c384..3ce5157 100644 --- a/pkg/cli/remotessh/ssh.go +++ b/pkg/cli/remotessh/ssh.go @@ -12,7 +12,8 @@ import ( type SSHOption func(*sshOptions) type sshOptions struct { - agentForward bool + agentForward bool + noHostKeyCheck bool } // WithAgentForward enables SSH agent forwarding (-A flag). @@ -21,22 +22,35 @@ func WithAgentForward() SSHOption { return func(o *sshOptions) { o.agentForward = true } } +// WithNoHostKeyCheck disables host key verification and uses /dev/null as known_hosts. +// Use for ephemeral servers (sandbox) where IPs are frequently recycled. +func WithNoHostKeyCheck() SSHOption { + return func(o *sshOptions) { o.noHostKeyCheck = true } +} + // UploadFile copies a local file to a remote host via SCP. // Requires node.SSHKey to be set (via PrepareNodeKeys). -func UploadFile(node inspector.Node, localPath, remotePath string) error { +func UploadFile(node inspector.Node, localPath, remotePath string, opts ...SSHOption) error { if node.SSHKey == "" { return fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name()) } + var cfg sshOptions + for _, o := range opts { + o(&cfg) + } + dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath) - cmd := exec.Command("scp", - "-o", "StrictHostKeyChecking=accept-new", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - localPath, dest, - ) + args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey} + if cfg.noHostKeyCheck { + args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...) + } else { + args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...) + } + args = append(args, localPath, dest) + cmd := exec.Command("scp", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -59,10 +73,11 @@ func RunSSHStreaming(node inspector.Node, command string, opts ...SSHOption) err o(&cfg) } - args := []string{ - "-o", "StrictHostKeyChecking=accept-new", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, + args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey} + if cfg.noHostKeyCheck { + args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...) + } else { + args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...) } if cfg.agentForward { args = append(args, "-A") diff --git a/pkg/cli/sandbox/create.go b/pkg/cli/sandbox/create.go index 292e1d9..cac3bd0 100644 --- a/pkg/cli/sandbox/create.go +++ b/pkg/cli/sandbox/create.go @@ -138,10 +138,14 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt } servers := make([]ServerState, 5) + var firstErr error for i := 0; i < 5; i++ { r := <-results if r.err != nil { - return fmt.Errorf("server %d: %w", r.index+1, r.err) + if firstErr == nil { + firstErr = fmt.Errorf("server %d: %w", r.index+1, r.err) + } + continue } fmt.Printf(" Created %s (ID: %d, initializing...)\n", r.server.Name, r.server.ID) role := "node" @@ -154,6 +158,10 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt Role: role, } } + state.Servers = servers // populate before returning so cleanup can delete created servers + if firstErr != nil { + return firstErr + } // Wait for all servers to reach "running" fmt.Print(" Waiting for servers to boot...") @@ -210,12 +218,12 @@ func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxS } // Wait for SSH to be ready on freshly booted servers - if err := waitForSSH(node, 2*time.Minute); err != nil { + if err := waitForSSH(node, 5*time.Minute); err != nil { return fmt.Errorf("SSH not ready on %s: %w", srv.Name, err) } cmd := fmt.Sprintf("ip addr add %s/32 dev lo 2>/dev/null || true", fip.IP) - if err := remotessh.RunSSHStreaming(node, cmd); err != nil { + if err := remotessh.RunSSHStreaming(node, cmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("configure loopback on %s: %w", srv.Name, err) } } @@ -236,9 +244,9 @@ func waitForSSH(node inspector.Node, timeout time.Duration) error { return fmt.Errorf("timeout after %s", timeout) } -// phase3UploadArchive builds (if needed) and uploads the binary archive to all nodes. +// phase3UploadArchive uploads the binary archive to the genesis node, then fans out +// to the remaining nodes server-to-server (much faster than uploading from local machine). func phase3UploadArchive(cfg *Config, state *SandboxState) error { - // Find existing archive archivePath := findNewestArchive() if archivePath == "" { fmt.Println(" No binary archive found, run `orama build` first") @@ -250,40 +258,73 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error { sshKeyPath := cfg.ExpandedPrivateKeyPath() remotePath := "/tmp/" + filepath.Base(archivePath) + extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", + remotePath, remotePath) - // Upload to all 5 nodes in parallel - var wg sync.WaitGroup - errs := make([]error, len(state.Servers)) + // Step 1: Upload from local machine to genesis node + genesis := state.Servers[0] + genesisNode := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} - for i, srv := range state.Servers { - wg.Add(1) - go func(idx int, srv ServerState) { - defer wg.Done() - node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} - - if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { - errs[idx] = fmt.Errorf("upload to %s: %w", srv.Name, err) - return - } - - // Extract + install CLI - extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama", - remotePath, remotePath) - if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { - errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) - return - } - fmt.Printf(" Uploaded to %s\n", srv.Name) - }(i, srv) + fmt.Printf(" Uploading to %s (genesis)...\n", genesis.Name) + if err := remotessh.UploadFile(genesisNode, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload to %s: %w", genesis.Name, err) } - wg.Wait() - for _, err := range errs { - if err != nil { - return err + // Step 2: Fan out from genesis to remaining nodes in parallel (server-to-server) + if len(state.Servers) > 1 { + fmt.Printf(" Fanning out from %s to %d nodes...\n", genesis.Name, len(state.Servers)-1) + + // Temporarily upload SSH key to genesis for server-to-server SCP + remoteKeyPath := "/tmp/.sandbox_key" + if err := remotessh.UploadFile(genesisNode, sshKeyPath, remoteKeyPath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload SSH key to genesis: %w", err) + } + // Always clean up the temporary key, even on panic/early return + defer remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("rm -f %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()) + + if err := remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("chmod 600 %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("chmod SSH key on genesis: %w", err) + } + + var wg sync.WaitGroup + errs := make([]error, len(state.Servers)) + + for i := 1; i < len(state.Servers); i++ { + wg.Add(1) + go func(idx int, srv ServerState) { + defer wg.Done() + // SCP from genesis to target using the uploaded key + scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i %s %s root@%s:%s", + remoteKeyPath, remotePath, srv.IP, remotePath) + if err := remotessh.RunSSHStreaming(genesisNode, scpCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("fanout to %s: %w", srv.Name, err) + return + } + // Extract on target + targetNode := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + if err := remotessh.RunSSHStreaming(targetNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) + return + } + fmt.Printf(" Distributed to %s\n", srv.Name) + }(i, state.Servers[i]) + } + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } } } + // Step 3: Extract on genesis + fmt.Printf(" Extracting on %s...\n", genesis.Name) + if err := remotessh.RunSSHStreaming(genesisNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("extract on %s: %w", genesis.Name, err) + } + + fmt.Println(" All nodes ready") return nil } @@ -294,10 +335,10 @@ func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) { node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} // Install genesis - installCmd := fmt.Sprintf("orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + installCmd := fmt.Sprintf("/opt/orama/bin/orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesis.IP, cfg.Domain, cfg.Domain) fmt.Printf(" Installing on %s (%s)...\n", genesis.Name, genesis.IP) - if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return nil, fmt.Errorf("install genesis: %w", err) } @@ -338,15 +379,15 @@ func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error { var installCmd string if srv.Role == "nameserver" { - installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesisIP, token, srv.IP, cfg.Domain, cfg.Domain) } else { - installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", + installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", genesisIP, token, srv.IP, cfg.Domain) } fmt.Printf(" [%d/%d] Joining %s (%s, %s)...\n", i, len(state.Servers)-1, srv.Name, srv.IP, srv.Role) - if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("join %s: %w", srv.Name, err) } @@ -402,7 +443,7 @@ func waitForRQLiteHealth(node inspector.Node, timeout time.Duration) error { // generateInviteToken runs `orama node invite` on the node and parses the token. func generateInviteToken(node inspector.Node) (string, error) { - out, err := runSSHOutput(node, "orama node invite --expiry 1h 2>&1") + out, err := runSSHOutput(node, "/opt/orama/bin/orama node invite --expiry 1h 2>&1") if err != nil { return "", fmt.Errorf("invite command failed: %w", err) } @@ -451,10 +492,12 @@ func isHex(s string) bool { } // runSSHOutput runs a command via SSH and returns stdout as a string. +// Uses StrictHostKeyChecking=no because sandbox IPs are frequently recycled. func runSSHOutput(node inspector.Node, command string) (string, error) { args := []string{ "ssh", "-n", - "-o", "StrictHostKeyChecking=accept-new", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=10", "-o", "BatchMode=yes", "-i", node.SSHKey, diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go index 8c7ccfd..ac186ee 100644 --- a/pkg/cli/sandbox/rollout.go +++ b/pkg/cli/sandbox/rollout.go @@ -42,14 +42,14 @@ func Rollout(name string) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Uploading to %s...\n", i+1, len(state.Servers), srv.Name) - if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { + if err := remotessh.UploadFile(node, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upload to %s: %w", srv.Name, err) } // Extract archive extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", remotePath, remotePath) - if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("extract on %s: %w", srv.Name, err) } } @@ -107,7 +107,7 @@ func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP) - if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart"); err != nil { + if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart", remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upgrade %s: %w", srv.Name, err) } diff --git a/pkg/cli/sandbox/ssh_cmd.go b/pkg/cli/sandbox/ssh_cmd.go index ed3bf61..f09ef08 100644 --- a/pkg/cli/sandbox/ssh_cmd.go +++ b/pkg/cli/sandbox/ssh_cmd.go @@ -36,7 +36,8 @@ func SSHInto(name string, nodeNum int) error { // Replace current process with SSH args := []string{ "ssh", - "-o", "StrictHostKeyChecking=accept-new", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-i", sshKeyPath, fmt.Sprintf("root@%s", srv.IP), } diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index 339e7d3..fce62b0 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -706,6 +706,20 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { ps.logf("Phase 5: Creating systemd services...") + // Re-chown all orama directories to the orama user. + // Phases 2b-4 create files as root (IPFS repo, configs, secrets, etc.) + // that must be readable/writable by the orama service user. + if err := exec.Command("id", "orama").Run(); err == nil { + for _, dir := range []string{ps.oramaDir, filepath.Join(ps.oramaHome, "bin")} { + if _, statErr := os.Stat(dir); statErr == nil { + if output, chownErr := exec.Command("chown", "-R", "orama:orama", dir).CombinedOutput(); chownErr != nil { + ps.logf(" ⚠️ Failed to chown %s: %v\n%s", dir, chownErr, string(output)) + } + } + } + ps.logf(" ✓ File ownership updated for orama user") + } + // Validate all required binaries are available before creating services ipfsBinary, err := ps.binaryInstaller.ResolveBinaryPath("ipfs", "/usr/local/bin/ipfs", "/usr/bin/ipfs") if err != nil { @@ -795,8 +809,9 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { // Caddy service on ALL nodes (any node may host namespaces and need TLS) if _, err := os.Stat("/usr/bin/caddy"); err == nil { - // Create caddy data directory + // Create caddy data directory and ensure orama user can write to it exec.Command("mkdir", "-p", "/var/lib/caddy").Run() + exec.Command("chown", "-R", "orama:orama", "/var/lib/caddy").Run() caddyUnit := ps.serviceGenerator.GenerateCaddyService() if err := ps.serviceController.WriteServiceUnit("caddy.service", caddyUnit); err != nil { diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 6c08da9..24c2a37 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -424,6 +424,7 @@ Wants=orama-node.service Type=simple %[1]s ReadWritePaths=%[2]s /var/lib/caddy /etc/caddy +Environment=XDG_DATA_HOME=/var/lib/caddy AmbientCapabilities=CAP_NET_BIND_SERVICE CapabilityBoundingSet=CAP_NET_BIND_SERVICE ExecStart=/usr/bin/caddy run --environ --config /etc/caddy/Caddyfile