mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-06-17 01:34:13 +00:00
fix(cli): node restart restores the caddy/coredns frontend
caddy/coredns declare Requires=orama-node.service, so stopping orama-node cascade-stops them; Requires propagates STOP but not START, and StartServicesOrdered only starts the orama services — so a bare 'orama node restart' left caddy dead and the node's :443 HTTPS frontend offline until the next reboot. Capture the active frontend units before the stop and restart exactly those after the gateway is healthy. Adds ServiceUnitExists helper + selectFrontendToRestore policy test.
This commit is contained in:
parent
d4bf187e94
commit
92428df7e9
@ -43,6 +43,16 @@ func HandleRestartWithFlags(force bool) {
|
||||
return
|
||||
}
|
||||
|
||||
// The TLS/DNS frontend (caddy, coredns) is NOT part of GetProductionServices,
|
||||
// but caddy/coredns declare `Requires=orama-node.service`, so stopping
|
||||
// orama-node below cascade-stops them via systemd. `Requires` propagates a
|
||||
// STOP but never a START, and StartServicesOrdered only starts the orama
|
||||
// services — so without this a bare `orama node restart` leaves caddy dead
|
||||
// and the node's :443 HTTPS frontend offline until the next reboot. Capture
|
||||
// which frontend units are running now and bring exactly those back at the
|
||||
// end, after the gateway is healthy (caddy's own ExecStartPre gates on it).
|
||||
frontendToRestore := activeFrontendServices()
|
||||
|
||||
// Stop namespace services first (same as stop command)
|
||||
fmt.Printf("\n Stopping namespace services...\n")
|
||||
stopAllNamespaceServices()
|
||||
@ -100,5 +110,45 @@ func HandleRestartWithFlags(force bool) {
|
||||
fmt.Printf("\n Starting services...\n")
|
||||
utils.StartServicesOrdered(services, "start")
|
||||
|
||||
// Bring the TLS/DNS frontend back up (see capture above). Done last so the
|
||||
// embedded gateway is already started; caddy's ExecStartPre then clears its
|
||||
// localhost:6001/health wait quickly instead of timing out.
|
||||
for _, svc := range frontendToRestore {
|
||||
if err := exec.Command("systemctl", "start", svc).Run(); err != nil {
|
||||
fmt.Printf(" Warning: Failed to start %s: %v\n", svc, err)
|
||||
} else {
|
||||
fmt.Printf(" Started %s\n", svc)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("\n All services restarted\n")
|
||||
}
|
||||
|
||||
// frontendServices are the TLS/DNS units that sit in front of the node and are
|
||||
// torn down (but not brought back) by an orama-node restart — see HandleRestartWithFlags.
|
||||
var frontendServices = []string{"coredns", "caddy"}
|
||||
|
||||
// activeFrontendServices returns the frontend units that are installed AND
|
||||
// currently active, so a restart can restore exactly the set that was running.
|
||||
func activeFrontendServices() []string {
|
||||
return selectFrontendToRestore(frontendServices, func(svc string) bool {
|
||||
if !utils.ServiceUnitExists(svc) {
|
||||
return false
|
||||
}
|
||||
running, _ := utils.IsServiceActive(svc)
|
||||
return running
|
||||
})
|
||||
}
|
||||
|
||||
// selectFrontendToRestore returns, in their original order, the candidates for
// which shouldRestore reports true. It is kept separate from the systemd
// probing so the restore policy can be unit-tested without a live host.
//
// A nil shouldRestore selects nothing rather than panicking: with no way to
// confirm a unit was running, the conservative choice is to restore none.
// The result is nil when no candidate is selected.
func selectFrontendToRestore(candidates []string, shouldRestore func(string) bool) []string {
	if shouldRestore == nil {
		return nil
	}

	var selected []string
	for _, unit := range candidates {
		if !shouldRestore(unit) {
			continue
		}
		selected = append(selected, unit)
	}
	return selected
}
|
||||
|
||||
39
core/pkg/cli/production/lifecycle/restart_test.go
Normal file
39
core/pkg/cli/production/lifecycle/restart_test.go
Normal file
@ -0,0 +1,39 @@
|
||||
package lifecycle
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestSelectFrontendToRestore pins the restore policy behind the bare-restart
|
||||
// caddy fix: a restart must bring back exactly the frontend units (caddy,
|
||||
// coredns) that were running before orama-node's stop cascade-killed them —
|
||||
// no more, no less.
|
||||
func TestSelectFrontendToRestore(t *testing.T) {
|
||||
// Both active → both restored, order preserved.
|
||||
got := selectFrontendToRestore([]string{"coredns", "caddy"}, func(string) bool { return true })
|
||||
if !reflect.DeepEqual(got, []string{"coredns", "caddy"}) {
|
||||
t.Errorf("both active: expected [coredns caddy], got %v", got)
|
||||
}
|
||||
|
||||
// Only caddy active (e.g. a non-nameserver node where coredns isn't running)
|
||||
// → only caddy is restored.
|
||||
got = selectFrontendToRestore([]string{"coredns", "caddy"}, func(svc string) bool {
|
||||
return svc == "caddy"
|
||||
})
|
||||
if !reflect.DeepEqual(got, []string{"caddy"}) {
|
||||
t.Errorf("only caddy: expected [caddy], got %v", got)
|
||||
}
|
||||
|
||||
// Nothing active (units absent or intentionally stopped) → restore nothing,
|
||||
// so a restart never starts a frontend that was deliberately down.
|
||||
got = selectFrontendToRestore([]string{"coredns", "caddy"}, func(string) bool { return false })
|
||||
if len(got) != 0 {
|
||||
t.Errorf("none active: expected empty, got %v", got)
|
||||
}
|
||||
|
||||
// Empty candidate list is safe.
|
||||
if got := selectFrontendToRestore(nil, func(string) bool { return true }); len(got) != 0 {
|
||||
t.Errorf("nil candidates: expected empty, got %v", got)
|
||||
}
|
||||
}
|
||||
@ -105,6 +105,14 @@ func ResolveServiceName(alias string) ([]string, error) {
|
||||
return nil, fmt.Errorf("service %q not found. Use: node, ipfs, cluster, gateway, olric, or full service name", alias)
|
||||
}
|
||||
|
||||
// systemdUnitDirs lists the directories probed for system unit files, in
// systemd's precedence order (see systemd.unit(5), "Unit File Load Path").
// /etc/systemd/system holds locally installed units; the lib directories hold
// units shipped by distribution packages.
var systemdUnitDirs = []string{
	"/etc/systemd/system",
	"/run/systemd/system",
	"/usr/local/lib/systemd/system",
	"/usr/lib/systemd/system",
	"/lib/systemd/system",
}

// ServiceUnitExists reports whether a systemd unit file is installed for the
// given service name (e.g. "caddy"). Used to guard restart/start logic so it
// only touches services actually present on this node.
//
// The name is given without the ".service" suffix. Every directory in
// systemdUnitDirs is checked, so a unit installed by a distribution package
// is found as well as one written to /etc/systemd/system. An empty name never
// matches. Any stat error (not found, permission denied, ...) for a directory
// is treated as "not present there".
func ServiceUnitExists(service string) bool {
	if service == "" {
		return false
	}

	unitFile := service + ".service"
	for _, dir := range systemdUnitDirs {
		if _, err := os.Stat(filepath.Join(dir, unitFile)); err == nil {
			return true
		}
	}
	return false
}
|
||||
|
||||
// IsServiceActive checks if a systemd service is currently active (running)
|
||||
func IsServiceActive(service string) (bool, error) {
|
||||
cmd := exec.Command("systemctl", "is-active", "--quiet", service)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user