orama/pkg/rqlite/watchdog.go

package rqlite

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"go.uber.org/zap"
)

const (
	// watchdogInterval is how often we check if rqlited is alive.
	watchdogInterval = 30 * time.Second

	// watchdogMaxRestart is the maximum number of restart attempts before giving up.
	watchdogMaxRestart = 3

	// watchdogGracePeriod is how long to wait after a restart before
	// the watchdog starts checking. This gives rqlited time to rejoin
	// the Raft cluster — Raft election timeouts + log replay can take
	// 60-120 seconds after a restart.
	watchdogGracePeriod = 120 * time.Second
)

// startProcessWatchdog monitors the RQLite child process and restarts it if it crashes.
// It only restarts when the process has actually DIED (exited). It does NOT kill
// rqlited for being slow to find a leader — that's normal during cluster rejoin.
func (r *RQLiteManager) startProcessWatchdog(ctx context.Context) {
	// Wait for the grace period before starting to monitor.
	// rqlited needs time to:
	// 1. Open the raft log and snapshots
	// 2. Reconnect to existing Raft peers
	// 3. Either rejoin as follower or participate in a new election
	select {
	case <-ctx.Done():
		return
	case <-time.After(watchdogGracePeriod):
	}

	ticker := time.NewTicker(watchdogInterval)
	defer ticker.Stop()

	restartCount := 0

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if !r.isProcessAlive() {
				r.logger.Error("RQLite process has died",
					zap.Int("restart_count", restartCount),
					zap.Int("max_restarts", watchdogMaxRestart))

				if restartCount >= watchdogMaxRestart {
					r.logger.Error("RQLite process watchdog: max restart attempts reached, giving up")
					return
				}

				if err := r.restartProcess(ctx); err != nil {
					r.logger.Error("Failed to restart RQLite process", zap.Error(err))
					restartCount++
					continue
				}

				restartCount++
				r.logger.Info("RQLite process restarted by watchdog",
					zap.Int("restart_count", restartCount))

				// Give the restarted process time to stabilize before checking again
				select {
				case <-ctx.Done():
					return
				case <-time.After(watchdogGracePeriod):
				}
			} else {
				// Process is alive — reset restart counter on sustained health
				if r.isHTTPResponsive() {
					if restartCount > 0 {
						r.logger.Info("RQLite process has stabilized, resetting restart counter",
							zap.Int("previous_restart_count", restartCount))
						restartCount = 0
					}
				}
			}
		}
	}
}

// isProcessAlive checks if the RQLite child process is still running
func (r *RQLiteManager) isProcessAlive() bool {
	if r.cmd == nil || r.cmd.Process == nil {
		return false
	}
	// On Unix, sending signal 0 checks process existence without actually signaling
	if err := r.cmd.Process.Signal(nil); err != nil {
		return false
	}
	return true
}

// isHTTPResponsive checks if RQLite is responding to HTTP status requests
func (r *RQLiteManager) isHTTPResponsive() bool {
	url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort)
	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Get(url)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK
}

// restartProcess attempts to restart the RQLite process
func (r *RQLiteManager) restartProcess(ctx context.Context) error {
	rqliteDataDir, err := r.rqliteDataDirPath()
	if err != nil {
		return fmt.Errorf("get data dir: %w", err)
	}

	if err := r.launchProcess(ctx, rqliteDataDir); err != nil {
		return fmt.Errorf("launch process: %w", err)
	}

	if err := r.waitForReadyAndConnect(ctx); err != nil {
		return fmt.Errorf("wait for ready: %w", err)
	}

	return nil
}