feat(sandbox): optimize archive upload via server-to-server fanout

- add WithNoHostKeyCheck option for ephemeral server IPs
- upload binary to genesis then distribute to other nodes (faster)
- improve provisioning error handling: record partial server state before returning so cleanup can delete servers created before a failure
This commit is contained in:
anonpenguin23 2026-03-07 14:27:09 +02:00
parent e2b6f7d721
commit 6468019136
6 changed files with 131 additions and 56 deletions

View File

@ -12,7 +12,8 @@ import (
type SSHOption func(*sshOptions)
type sshOptions struct {
agentForward bool
agentForward bool
noHostKeyCheck bool
}
// WithAgentForward enables SSH agent forwarding (-A flag).
@ -21,22 +22,35 @@ func WithAgentForward() SSHOption {
return func(o *sshOptions) { o.agentForward = true }
}
// WithNoHostKeyCheck turns off SSH host key verification and points known_hosts
// at /dev/null. Intended for ephemeral (sandbox) servers whose IPs are
// frequently recycled and would otherwise trigger host-key-mismatch failures.
func WithNoHostKeyCheck() SSHOption {
	disable := func(cfg *sshOptions) {
		cfg.noHostKeyCheck = true
	}
	return disable
}
// UploadFile copies a local file to a remote host via SCP.
// Requires node.SSHKey to be set (via PrepareNodeKeys).
func UploadFile(node inspector.Node, localPath, remotePath string) error {
func UploadFile(node inspector.Node, localPath, remotePath string, opts ...SSHOption) error {
if node.SSHKey == "" {
return fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name())
}
var cfg sshOptions
for _, o := range opts {
o(&cfg)
}
dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath)
cmd := exec.Command("scp",
"-o", "StrictHostKeyChecking=accept-new",
"-o", "ConnectTimeout=10",
"-i", node.SSHKey,
localPath, dest,
)
args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey}
if cfg.noHostKeyCheck {
args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...)
} else {
args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...)
}
args = append(args, localPath, dest)
cmd := exec.Command("scp", args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
@ -59,10 +73,11 @@ func RunSSHStreaming(node inspector.Node, command string, opts ...SSHOption) err
o(&cfg)
}
args := []string{
"-o", "StrictHostKeyChecking=accept-new",
"-o", "ConnectTimeout=10",
"-i", node.SSHKey,
args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey}
if cfg.noHostKeyCheck {
args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...)
} else {
args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...)
}
if cfg.agentForward {
args = append(args, "-A")

View File

@ -138,10 +138,14 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt
}
servers := make([]ServerState, 5)
var firstErr error
for i := 0; i < 5; i++ {
r := <-results
if r.err != nil {
return fmt.Errorf("server %d: %w", r.index+1, r.err)
if firstErr == nil {
firstErr = fmt.Errorf("server %d: %w", r.index+1, r.err)
}
continue
}
fmt.Printf(" Created %s (ID: %d, initializing...)\n", r.server.Name, r.server.ID)
role := "node"
@ -154,6 +158,10 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt
Role: role,
}
}
state.Servers = servers // populate before returning so cleanup can delete created servers
if firstErr != nil {
return firstErr
}
// Wait for all servers to reach "running"
fmt.Print(" Waiting for servers to boot...")
@ -210,12 +218,12 @@ func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxS
}
// Wait for SSH to be ready on freshly booted servers
if err := waitForSSH(node, 2*time.Minute); err != nil {
if err := waitForSSH(node, 5*time.Minute); err != nil {
return fmt.Errorf("SSH not ready on %s: %w", srv.Name, err)
}
cmd := fmt.Sprintf("ip addr add %s/32 dev lo 2>/dev/null || true", fip.IP)
if err := remotessh.RunSSHStreaming(node, cmd); err != nil {
if err := remotessh.RunSSHStreaming(node, cmd, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("configure loopback on %s: %w", srv.Name, err)
}
}
@ -236,9 +244,9 @@ func waitForSSH(node inspector.Node, timeout time.Duration) error {
return fmt.Errorf("timeout after %s", timeout)
}
// phase3UploadArchive builds (if needed) and uploads the binary archive to all nodes.
// phase3UploadArchive uploads the binary archive to the genesis node, then fans out
// to the remaining nodes server-to-server (much faster than uploading from local machine).
func phase3UploadArchive(cfg *Config, state *SandboxState) error {
// Find existing archive
archivePath := findNewestArchive()
if archivePath == "" {
fmt.Println(" No binary archive found, run `orama build` first")
@ -250,40 +258,73 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error {
sshKeyPath := cfg.ExpandedPrivateKeyPath()
remotePath := "/tmp/" + filepath.Base(archivePath)
extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s",
remotePath, remotePath)
// Upload to all 5 nodes in parallel
var wg sync.WaitGroup
errs := make([]error, len(state.Servers))
// Step 1: Upload from local machine to genesis node
genesis := state.Servers[0]
genesisNode := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath}
for i, srv := range state.Servers {
wg.Add(1)
go func(idx int, srv ServerState) {
defer wg.Done()
node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath}
if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil {
errs[idx] = fmt.Errorf("upload to %s: %w", srv.Name, err)
return
}
// Extract + install CLI
extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama",
remotePath, remotePath)
if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil {
errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err)
return
}
fmt.Printf(" Uploaded to %s\n", srv.Name)
}(i, srv)
fmt.Printf(" Uploading to %s (genesis)...\n", genesis.Name)
if err := remotessh.UploadFile(genesisNode, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("upload to %s: %w", genesis.Name, err)
}
wg.Wait()
for _, err := range errs {
if err != nil {
return err
// Step 2: Fan out from genesis to remaining nodes in parallel (server-to-server)
if len(state.Servers) > 1 {
fmt.Printf(" Fanning out from %s to %d nodes...\n", genesis.Name, len(state.Servers)-1)
// Temporarily upload SSH key to genesis for server-to-server SCP
remoteKeyPath := "/tmp/.sandbox_key"
if err := remotessh.UploadFile(genesisNode, sshKeyPath, remoteKeyPath, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("upload SSH key to genesis: %w", err)
}
// Always clean up the temporary key, even on panic/early return
defer remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("rm -f %s", remoteKeyPath), remotessh.WithNoHostKeyCheck())
if err := remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("chmod 600 %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("chmod SSH key on genesis: %w", err)
}
var wg sync.WaitGroup
errs := make([]error, len(state.Servers))
for i := 1; i < len(state.Servers); i++ {
wg.Add(1)
go func(idx int, srv ServerState) {
defer wg.Done()
// SCP from genesis to target using the uploaded key
scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i %s %s root@%s:%s",
remoteKeyPath, remotePath, srv.IP, remotePath)
if err := remotessh.RunSSHStreaming(genesisNode, scpCmd, remotessh.WithNoHostKeyCheck()); err != nil {
errs[idx] = fmt.Errorf("fanout to %s: %w", srv.Name, err)
return
}
// Extract on target
targetNode := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath}
if err := remotessh.RunSSHStreaming(targetNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil {
errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err)
return
}
fmt.Printf(" Distributed to %s\n", srv.Name)
}(i, state.Servers[i])
}
wg.Wait()
for _, err := range errs {
if err != nil {
return err
}
}
}
// Step 3: Extract on genesis
fmt.Printf(" Extracting on %s...\n", genesis.Name)
if err := remotessh.RunSSHStreaming(genesisNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("extract on %s: %w", genesis.Name, err)
}
fmt.Println(" All nodes ready")
return nil
}
@ -294,10 +335,10 @@ func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) {
node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath}
// Install genesis
installCmd := fmt.Sprintf("orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks",
installCmd := fmt.Sprintf("/opt/orama/bin/orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks",
genesis.IP, cfg.Domain, cfg.Domain)
fmt.Printf(" Installing on %s (%s)...\n", genesis.Name, genesis.IP)
if err := remotessh.RunSSHStreaming(node, installCmd); err != nil {
if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil {
return nil, fmt.Errorf("install genesis: %w", err)
}
@ -338,15 +379,15 @@ func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error {
var installCmd string
if srv.Role == "nameserver" {
installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks",
installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks",
genesisIP, token, srv.IP, cfg.Domain, cfg.Domain)
} else {
installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks",
installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks",
genesisIP, token, srv.IP, cfg.Domain)
}
fmt.Printf(" [%d/%d] Joining %s (%s, %s)...\n", i, len(state.Servers)-1, srv.Name, srv.IP, srv.Role)
if err := remotessh.RunSSHStreaming(node, installCmd); err != nil {
if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("join %s: %w", srv.Name, err)
}
@ -402,7 +443,7 @@ func waitForRQLiteHealth(node inspector.Node, timeout time.Duration) error {
// generateInviteToken runs `orama node invite` on the node and parses the token.
func generateInviteToken(node inspector.Node) (string, error) {
out, err := runSSHOutput(node, "orama node invite --expiry 1h 2>&1")
out, err := runSSHOutput(node, "/opt/orama/bin/orama node invite --expiry 1h 2>&1")
if err != nil {
return "", fmt.Errorf("invite command failed: %w", err)
}
@ -451,10 +492,12 @@ func isHex(s string) bool {
}
// runSSHOutput runs a command via SSH and returns stdout as a string.
// Uses StrictHostKeyChecking=no because sandbox IPs are frequently recycled.
func runSSHOutput(node inspector.Node, command string) (string, error) {
args := []string{
"ssh", "-n",
"-o", "StrictHostKeyChecking=accept-new",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
"-o", "BatchMode=yes",
"-i", node.SSHKey,

View File

@ -42,14 +42,14 @@ func Rollout(name string) error {
node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath}
fmt.Printf(" [%d/%d] Uploading to %s...\n", i+1, len(state.Servers), srv.Name)
if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil {
if err := remotessh.UploadFile(node, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("upload to %s: %w", srv.Name, err)
}
// Extract archive
extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s",
remotePath, remotePath)
if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil {
if err := remotessh.RunSSHStreaming(node, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("extract on %s: %w", srv.Name, err)
}
}
@ -107,7 +107,7 @@ func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error {
node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath}
fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP)
if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart"); err != nil {
if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart", remotessh.WithNoHostKeyCheck()); err != nil {
return fmt.Errorf("upgrade %s: %w", srv.Name, err)
}

View File

@ -36,7 +36,8 @@ func SSHInto(name string, nodeNum int) error {
// Replace current process with SSH
args := []string{
"ssh",
"-o", "StrictHostKeyChecking=accept-new",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-i", sshKeyPath,
fmt.Sprintf("root@%s", srv.IP),
}

View File

@ -706,6 +706,20 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s
func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
ps.logf("Phase 5: Creating systemd services...")
// Re-chown all orama directories to the orama user.
// Phases 2b-4 create files as root (IPFS repo, configs, secrets, etc.)
// that must be readable/writable by the orama service user.
if err := exec.Command("id", "orama").Run(); err == nil {
for _, dir := range []string{ps.oramaDir, filepath.Join(ps.oramaHome, "bin")} {
if _, statErr := os.Stat(dir); statErr == nil {
if output, chownErr := exec.Command("chown", "-R", "orama:orama", dir).CombinedOutput(); chownErr != nil {
ps.logf(" ⚠️ Failed to chown %s: %v\n%s", dir, chownErr, string(output))
}
}
}
ps.logf(" ✓ File ownership updated for orama user")
}
// Validate all required binaries are available before creating services
ipfsBinary, err := ps.binaryInstaller.ResolveBinaryPath("ipfs", "/usr/local/bin/ipfs", "/usr/bin/ipfs")
if err != nil {
@ -795,8 +809,9 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error {
// Caddy service on ALL nodes (any node may host namespaces and need TLS)
if _, err := os.Stat("/usr/bin/caddy"); err == nil {
// Create caddy data directory
// Create caddy data directory and ensure orama user can write to it
exec.Command("mkdir", "-p", "/var/lib/caddy").Run()
exec.Command("chown", "-R", "orama:orama", "/var/lib/caddy").Run()
caddyUnit := ps.serviceGenerator.GenerateCaddyService()
if err := ps.serviceController.WriteServiceUnit("caddy.service", caddyUnit); err != nil {

View File

@ -424,6 +424,7 @@ Wants=orama-node.service
Type=simple
%[1]s
ReadWritePaths=%[2]s /var/lib/caddy /etc/caddy
Environment=XDG_DATA_HOME=/var/lib/caddy
AmbientCapabilities=CAP_NET_BIND_SERVICE
CapabilityBoundingSet=CAP_NET_BIND_SERVICE
ExecStart=/usr/bin/caddy run --environ --config /etc/caddy/Caddyfile