orama/pkg/cli/production/lifecycle/post_upgrade.go
2026-02-14 14:14:04 +02:00

144 lines
4.0 KiB
Go

package lifecycle
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"time"
"github.com/DeBrosOfficial/network/pkg/cli/utils"
)
// HandlePostUpgrade brings the node back online after an upgrade:
// 1. Resets failed + unmasks + enables all services
// 2. Starts services in dependency order
// 3. Waits for global RQLite to be ready
// 4. Waits for each namespace RQLite to be ready
// 5. Removes maintenance flag
//
// It must be run as root; otherwise it prints an error to stderr and exits
// with status 1. If no services are found it prints a warning and returns
// without doing anything else. Past that point every step is best-effort:
// problems are printed as warnings and the remaining steps still run.
func HandlePostUpgrade() {
	if os.Geteuid() != 0 {
		fmt.Fprintf(os.Stderr, "Error: post-upgrade must be run as root (use sudo)\n")
		os.Exit(1)
	}
	fmt.Printf("Post-upgrade: bringing node back online...\n")

	// 1. Get all services
	services := utils.GetProductionServices()
	if len(services) == 0 {
		fmt.Printf(" Warning: no Orama services found\n")
		return
	}

	// Reset failed state. Deliberately best-effort: a non-zero exit here does
	// not stop the services from being unmasked, enabled and started below.
	resetArgs := []string{"reset-failed"}
	resetArgs = append(resetArgs, services...)
	_ = exec.Command("systemctl", resetArgs...).Run()

	// Unmask and enable all services. A service whose state cannot be queried
	// is skipped (as before); a failed unmask/enable is reported so that a
	// later start failure can be traced back to it.
	for _, svc := range services {
		if masked, err := utils.IsServiceMasked(svc); err == nil && masked {
			if err := exec.Command("systemctl", "unmask", svc).Run(); err != nil {
				fmt.Printf(" Warning: failed to unmask %s: %v\n", svc, err)
			}
		}
		if enabled, err := utils.IsServiceEnabled(svc); err == nil && !enabled {
			if err := exec.Command("systemctl", "enable", svc).Run(); err != nil {
				fmt.Printf(" Warning: failed to enable %s: %v\n", svc, err)
			}
		}
	}
	fmt.Printf(" Services reset and enabled\n")

	// 2. Start services in dependency order
	fmt.Printf(" Starting services...\n")
	utils.StartServicesOrdered(services, "start")
	fmt.Printf(" Services started\n")

	// 3. Wait for global RQLite (port 5001) to be ready
	fmt.Printf(" Waiting for global RQLite (port 5001)...\n")
	if err := waitForRQLiteReady(5001, 120*time.Second); err != nil {
		fmt.Printf(" Warning: global RQLite not ready: %v\n", err)
	} else {
		fmt.Printf(" Global RQLite ready\n")
	}

	// 4. Wait for each namespace RQLite with a global timeout of 5 minutes
	nsPorts := getNamespaceRQLitePorts()
	if len(nsPorts) > 0 {
		fmt.Printf(" Waiting for %d namespace RQLite instances...\n", len(nsPorts))
		globalDeadline := time.Now().Add(5 * time.Minute)
		healthy := 0
		failed := 0
		for ns, port := range nsPorts {
			remaining := time.Until(globalDeadline)
			if remaining <= 0 {
				// Out of shared budget: everything not yet visited counts as failed.
				fmt.Printf(" Warning: global timeout reached, skipping remaining namespaces\n")
				failed += len(nsPorts) - healthy - failed
				break
			}
			// Each namespace gets at most 90s, shortened to whatever is left
			// of the shared 5-minute budget.
			timeout := 90 * time.Second
			if remaining < timeout {
				timeout = remaining
			}
			fmt.Printf(" Waiting for namespace '%s' (port %d)...\n", ns, port)
			if err := waitForRQLiteReady(port, timeout); err != nil {
				fmt.Printf(" Warning: namespace '%s' RQLite not ready: %v\n", ns, err)
				failed++
			} else {
				fmt.Printf(" Namespace '%s' ready\n", ns)
				healthy++
			}
		}
		fmt.Printf(" Namespace RQLite: %d healthy, %d failed\n", healthy, failed)
	}

	// 5. Remove maintenance flag. A flag that is already absent counts as removed.
	if err := os.Remove(maintenanceFlagPath); err != nil && !os.IsNotExist(err) {
		fmt.Printf(" Warning: failed to remove maintenance flag: %v\n", err)
	} else {
		fmt.Printf(" Maintenance flag removed\n")
	}
	fmt.Printf("Post-upgrade complete. Node is back online.\n")
}
// waitForRQLiteReady polls an RQLite instance's /status endpoint on localhost
// until it reports the Raft state "Leader" or "Follower", or the timeout
// expires.
//
// port is the HTTP port to query and timeout bounds the whole wait. Polls are
// spaced two seconds apart, but the pause after an unsuccessful poll is cut
// short at the deadline, so the function does not sleep past its timeout
// (a request already in flight may still run up to its own two-second client
// timeout). It returns nil once the instance is ready and a timeout error
// naming the port otherwise. A non-positive timeout fails without polling.
func waitForRQLiteReady(port int, timeout time.Duration) error {
	const pollInterval = 2 * time.Second

	deadline := time.Now().Add(timeout)
	client := &http.Client{Timeout: pollInterval}
	url := fmt.Sprintf("http://localhost:%d/status", port)

	for time.Now().Before(deadline) {
		if rqliteRaftJoined(client, url) {
			return nil
		}
		// Wait for the next poll, but never beyond the deadline.
		pause := time.Until(deadline)
		if pause > pollInterval {
			pause = pollInterval
		}
		time.Sleep(pause)
	}
	return fmt.Errorf("timeout after %s waiting for RQLite on port %d", timeout, port)
}

// rqliteRaftJoined performs a single GET of url and reports whether the JSON
// response carries store.raft.state equal to "Leader" or "Follower". Request
// errors and undecodable bodies are reported as not ready.
func rqliteRaftJoined(client *http.Client, url string) bool {
	// Upper bound on how much of the status document is read into memory.
	const maxStatusBytes = 1 << 20

	resp, err := client.Get(url)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	// The read error is intentionally ignored: whatever was read is handed to
	// the decoder, and a truncated document fails to decode and is retried.
	body, _ := io.ReadAll(io.LimitReader(resp.Body, maxStatusBytes))

	var status struct {
		Store struct {
			Raft struct {
				State string `json:"state"`
			} `json:"raft"`
		} `json:"store"`
	}
	if err := json.Unmarshal(body, &status); err != nil {
		return false
	}

	switch status.Store.Raft.State {
	case "Leader", "Follower":
		return true
	}
	return false
}