diff --git a/core/pkg/cli/production/lifecycle/restart.go b/core/pkg/cli/production/lifecycle/restart.go index 07e7e1d..075e678 100644 --- a/core/pkg/cli/production/lifecycle/restart.go +++ b/core/pkg/cli/production/lifecycle/restart.go @@ -43,6 +43,16 @@ func HandleRestartWithFlags(force bool) { return } + // The TLS/DNS frontend (caddy, coredns) is NOT part of GetProductionServices, + // but caddy/coredns declare `Requires=orama-node.service`, so stopping + // orama-node below cascade-stops them via systemd. `Requires` propagates a + // STOP but never a START, and StartServicesOrdered only starts the orama + // services — so without this a bare `orama node restart` leaves caddy dead + // and the node's :443 HTTPS frontend offline until the next reboot. Capture + // which frontend units are running now and bring exactly those back at the + // end, after the gateway is healthy (caddy's own ExecStartPre gates on it). + frontendToRestore := activeFrontendServices() + // Stop namespace services first (same as stop command) fmt.Printf("\n Stopping namespace services...\n") stopAllNamespaceServices() @@ -100,5 +110,45 @@ func HandleRestartWithFlags(force bool) { fmt.Printf("\n Starting services...\n") utils.StartServicesOrdered(services, "start") + // Bring the TLS/DNS frontend back up (see capture above). Done last so the + // embedded gateway is already started; caddy's ExecStartPre then clears its + // localhost:6001/health wait quickly instead of timing out. + for _, svc := range frontendToRestore { + if err := exec.Command("systemctl", "start", svc).Run(); err != nil { + fmt.Printf(" Warning: Failed to start %s: %v\n", svc, err) + } else { + fmt.Printf(" Started %s\n", svc) + } + } + fmt.Printf("\n All services restarted\n") } + +// frontendServices are the TLS/DNS units that sit in front of the node and are +// torn down (but not brought back) by an orama-node restart — see HandleRestartWithFlags. 
+var frontendServices = []string{"coredns", "caddy"}
+
+// activeFrontendServices returns the frontend units that are installed AND
+// currently active, so a restart can restore exactly the set that was running.
+func activeFrontendServices() []string {
+	return selectFrontendToRestore(frontendServices, func(svc string) bool {
+		if !utils.ServiceUnitExists(svc) {
+			return false
+		}
+		running, _ := utils.IsServiceActive(svc) // NOTE(review): probe error is dropped; assumes IsServiceActive reports false on failure
+		return running
+	})
+}
+
+// selectFrontendToRestore filters candidates to those shouldRestore reports
+// true for, preserving order. Split from the systemd probing so the restore
+// policy is unit-testable without a live host.
+func selectFrontendToRestore(candidates []string, shouldRestore func(string) bool) []string {
+	var out []string
+	for _, svc := range candidates {
+		if shouldRestore(svc) {
+			out = append(out, svc)
+		}
+	}
+	return out
+}
diff --git a/core/pkg/cli/production/lifecycle/restart_test.go b/core/pkg/cli/production/lifecycle/restart_test.go
new file mode 100644
index 0000000..ece18ec
--- /dev/null
+++ b/core/pkg/cli/production/lifecycle/restart_test.go
@@ -0,0 +1,39 @@
+package lifecycle
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestSelectFrontendToRestore pins the restore policy behind the bare-restart
+// caddy fix: a restart must bring back exactly the frontend units (caddy,
+// coredns) that were running before orama-node's stop cascade-killed them —
+// no more, no less.
+func TestSelectFrontendToRestore(t *testing.T) {
+	// Both active → both restored, order preserved.
+	got := selectFrontendToRestore([]string{"coredns", "caddy"}, func(string) bool { return true })
+	if !reflect.DeepEqual(got, []string{"coredns", "caddy"}) {
+		t.Errorf("both active: expected [coredns caddy], got %v", got)
+	}
+
+	// Only caddy active (e.g. a non-nameserver node where coredns isn't running)
+	// → only caddy is restored.
+	got = selectFrontendToRestore([]string{"coredns", "caddy"}, func(svc string) bool {
+		return svc == "caddy"
+	})
+	if !reflect.DeepEqual(got, []string{"caddy"}) {
+		t.Errorf("only caddy: expected [caddy], got %v", got)
+	}
+
+	// Nothing active (units absent or intentionally stopped) → restore nothing,
+	// so a restart never starts a frontend that was deliberately down.
+	got = selectFrontendToRestore([]string{"coredns", "caddy"}, func(string) bool { return false })
+	if len(got) != 0 {
+		t.Errorf("none active: expected empty, got %v", got)
+	}
+
+	// Empty candidate list is safe.
+	if got := selectFrontendToRestore(nil, func(string) bool { return true }); len(got) != 0 {
+		t.Errorf("nil candidates: expected empty, got %v", got)
+	}
+}
diff --git a/core/pkg/cli/utils/systemd.go b/core/pkg/cli/utils/systemd.go
index 2869f33..a356b4a 100644
--- a/core/pkg/cli/utils/systemd.go
+++ b/core/pkg/cli/utils/systemd.go
@@ -105,6 +105,30 @@ func ResolveServiceName(alias string) ([]string, error) {
 	return nil, fmt.Errorf("service %q not found. Use: node, ipfs, cluster, gateway, olric, or full service name", alias)
 }
 
+// ServiceUnitExists reports whether a systemd unit file is installed for the
+// given service name (e.g. "caddy"). Used to guard restart/start logic so it
+// only touches services actually present on this node.
+//
+// Every directory on systemd's system unit search path is probed, not just
+// /etc/systemd/system, so units shipped by a distro package (which install
+// under /usr/lib or /lib) are detected as well as admin-installed ones.
+func ServiceUnitExists(service string) bool {
+	unitDirs := []string{
+		"/etc/systemd/system",
+		"/run/systemd/system",
+		"/usr/local/lib/systemd/system",
+		"/usr/lib/systemd/system",
+		"/lib/systemd/system",
+	}
+	unit := service + ".service"
+	for _, dir := range unitDirs {
+		if _, err := os.Stat(filepath.Join(dir, unit)); err == nil {
+			return true
+		}
+	}
+	return false
+}
+
 // IsServiceActive checks if a systemd service is currently active (running)
 func IsServiceActive(service string) (bool, error) {
 	cmd := exec.Command("systemctl", "is-active", "--quiet", service)