fix(cli): node restart restores the caddy/coredns frontend

caddy/coredns declare Requires=orama-node.service, so stopping orama-node
cascade-stops them; Requires propagates STOP but not START, and
StartServicesOrdered only starts the orama services — so a bare
'orama node restart' left caddy dead and the node's :443 HTTPS frontend
offline until the next reboot. Capture the active frontend units before the
stop and restart exactly those after the gateway is healthy. Adds
ServiceUnitExists helper + selectFrontendToRestore policy test.
This commit is contained in:
anonpenguin23 2026-06-15 21:57:29 +03:00
parent d4bf187e94
commit 92428df7e9
3 changed files with 97 additions and 0 deletions

View File

@ -43,6 +43,16 @@ func HandleRestartWithFlags(force bool) {
return
}
// The TLS/DNS frontend (caddy, coredns) is NOT part of GetProductionServices,
// but caddy/coredns declare `Requires=orama-node.service`, so stopping
// orama-node below cascade-stops them via systemd. `Requires` propagates a
// STOP but never a START, and StartServicesOrdered only starts the orama
// services — so without this a bare `orama node restart` leaves caddy dead
// and the node's :443 HTTPS frontend offline until the next reboot. Capture
// which frontend units are running now and bring exactly those back at the
// end, after the gateway is healthy (caddy's own ExecStartPre gates on it).
frontendToRestore := activeFrontendServices()
// Stop namespace services first (same as stop command)
fmt.Printf("\n Stopping namespace services...\n")
stopAllNamespaceServices()
@ -100,5 +110,45 @@ func HandleRestartWithFlags(force bool) {
fmt.Printf("\n Starting services...\n")
utils.StartServicesOrdered(services, "start")
// Bring the TLS/DNS frontend back up (see capture above). Done last so the
// embedded gateway is already started; caddy's ExecStartPre then clears its
// localhost:6001/health wait quickly instead of timing out.
for _, svc := range frontendToRestore {
if err := exec.Command("systemctl", "start", svc).Run(); err != nil {
fmt.Printf(" Warning: Failed to start %s: %v\n", svc, err)
} else {
fmt.Printf(" Started %s\n", svc)
}
}
fmt.Printf("\n All services restarted\n")
}
// frontendServices lists the TLS/DNS units that front the node. An orama-node
// restart stops them without starting them again (see HandleRestartWithFlags),
// so they are the candidates considered for restoration after a restart.
var frontendServices = []string{
	"coredns",
	"caddy",
}
// activeFrontendServices returns the frontend units that are installed AND
// currently active, so a restart can restore exactly the set that was running.
func activeFrontendServices() []string {
return selectFrontendToRestore(frontendServices, func(svc string) bool {
if !utils.ServiceUnitExists(svc) {
return false
}
running, _ := utils.IsServiceActive(svc)
return running
})
}
// selectFrontendToRestore returns, in their original order, the candidates for
// which shouldRestore answers true. The result is nil when none qualify. It is
// kept free of any systemd probing so the restore policy can be unit-tested
// without a live host.
func selectFrontendToRestore(candidates []string, shouldRestore func(string) bool) []string {
	var kept []string
	for i := range candidates {
		name := candidates[i]
		if !shouldRestore(name) {
			continue
		}
		kept = append(kept, name)
	}
	return kept
}

View File

@ -0,0 +1,39 @@
package lifecycle
import (
"reflect"
"testing"
)
// TestSelectFrontendToRestore locks down the restore policy used by the
// bare-restart caddy fix: after a restart, exactly the frontend units (caddy,
// coredns) selected by the predicate are returned, in candidate order, and
// nothing else.
func TestSelectFrontendToRestore(t *testing.T) {
	always := func(string) bool { return true }
	never := func(string) bool { return false }
	onlyCaddy := func(svc string) bool { return svc == "caddy" }

	cases := []struct {
		candidates []string
		restore    func(string) bool
		want       []string // nil means "expect an empty result"
		failure    string
	}{
		// Both active: both restored, order preserved.
		{[]string{"coredns", "caddy"}, always, []string{"coredns", "caddy"},
			"both active: expected [coredns caddy], got %v"},
		// Only caddy active (e.g. a non-nameserver node where coredns isn't
		// running): only caddy is restored.
		{[]string{"coredns", "caddy"}, onlyCaddy, []string{"caddy"},
			"only caddy: expected [caddy], got %v"},
		// Nothing active (units absent or intentionally stopped): restore
		// nothing, so a restart never starts a frontend that was deliberately down.
		{[]string{"coredns", "caddy"}, never, nil,
			"none active: expected empty, got %v"},
		// Empty candidate list is safe.
		{nil, always, nil,
			"nil candidates: expected empty, got %v"},
	}

	for _, tc := range cases {
		got := selectFrontendToRestore(tc.candidates, tc.restore)
		// Empty expectations are checked by length so that both a nil and an
		// empty non-nil slice are accepted.
		matched := len(got) == 0
		if tc.want != nil {
			matched = reflect.DeepEqual(got, tc.want)
		}
		if !matched {
			t.Errorf(tc.failure, got)
		}
	}
}

View File

@ -105,6 +105,14 @@ func ResolveServiceName(alias string) ([]string, error) {
return nil, fmt.Errorf("service %q not found. Use: node, ipfs, cluster, gateway, olric, or full service name", alias)
}
// ServiceUnitExists reports whether a systemd unit file is installed for the
// given service name (e.g. "caddy"; a trailing ".service" is also accepted).
// Used to guard restart/start logic so it only touches services actually
// present on this node.
//
// Every standard system unit directory is consulted, not just
// /etc/systemd/system: distribution packages ship their units under
// /usr/lib/systemd/system (or /lib/systemd/system), and looking only in /etc
// would report such a service as missing.
func ServiceUnitExists(service string) bool {
	return serviceUnitExistsIn([]string{
		"/etc/systemd/system",
		"/run/systemd/system",
		"/usr/local/lib/systemd/system",
		"/usr/lib/systemd/system",
		"/lib/systemd/system",
	}, service)
}

// serviceUnitExistsIn reports whether the unit file for service exists as a
// non-directory entry in any of dirs. An empty service name never matches.
func serviceUnitExistsIn(dirs []string, service string) bool {
	if service == "" {
		return false
	}
	unit := service
	if filepath.Ext(unit) != ".service" {
		unit += ".service"
	}
	for _, dir := range dirs {
		info, err := os.Stat(filepath.Join(dir, unit))
		// A directory with the unit's name is not a unit file.
		if err == nil && !info.IsDir() {
			return true
		}
	}
	return false
}
// IsServiceActive checks if a systemd service is currently active (running)
func IsServiceActive(service string) (bool, error) {
cmd := exec.Command("systemctl", "is-active", "--quiet", service)