orama/pkg/rqlite/rqlite.go
2026-02-13 12:47:02 +02:00

164 lines
4.5 KiB
Go

package rqlite
import (
"context"
"fmt"
"os"
"os/exec"
"syscall"
"time"
"github.com/DeBrosOfficial/network/migrations"
"github.com/DeBrosOfficial/network/pkg/config"
"github.com/rqlite/gorqlite"
"go.uber.org/zap"
)
// RQLiteManager manages an RQLite node instance. It bundles the node's
// configuration with the process handle that Stop signals and the client
// connection that GetConnection hands out.
type RQLiteManager struct {
// config is the database configuration passed to NewRQLiteManager.
config *config.DatabaseConfig
// discoverConfig is the discovery configuration passed to NewRQLiteManager;
// Start refuses to run when its HttpAdvAddress is empty.
discoverConfig *config.DiscoveryConfig
// dataDir is the directory passed to NewRQLiteManager; the PID file removed
// on shutdown is located relative to it (<dataDir>/../logs/rqlited.pid).
dataDir string
// nodeType is not populated by NewRQLiteManager.
nodeType string // Node type identifier
// logger is the caller's logger tagged with component=rqlite-manager.
logger *zap.Logger
// cmd is the process handle that Stop sends SIGTERM/SIGKILL to and waits on.
// NOTE(review): presumably assigned by launchProcess — confirm.
cmd *exec.Cmd
// connection is the handle returned by GetConnection; Stop closes it and
// resets it to nil.
connection *gorqlite.Connection
// discoveryService is optional and not set by NewRQLiteManager. When it is
// non-nil, Start calls waitForMinClusterSizeBeforeStart and launches the
// startHealthMonitoring and startVoterReconciliation goroutines.
discoveryService *ClusterDiscoveryService
}
// NewRQLiteManager builds a manager from the given database configuration,
// discovery configuration and data directory. The supplied logger is tagged
// with component=rqlite-manager before being stored; a nil logger is not
// handled here. The remaining fields (process handle, connection, discovery
// service, node type) are left at their zero values.
func NewRQLiteManager(cfg *config.DatabaseConfig, discoveryCfg *config.DiscoveryConfig, dataDir string, logger *zap.Logger) *RQLiteManager {
	mgr := new(RQLiteManager)
	mgr.config = cfg
	mgr.discoverConfig = discoveryCfg
	mgr.dataDir = dataDir
	mgr.logger = logger.With(zap.String("component", "rqlite-manager"))
	return mgr
}
// Start starts the RQLite node.
//
// The steps run in this order, and the first one that fails aborts startup
// with its error returned unchanged:
//
//  1. prepareDataDir, then validation of the discovery configuration
//     (it must be non-nil and carry a non-empty HttpAdvAddress);
//  2. waitForMinClusterSizeBeforeStart, only when a discovery service is attached;
//  3. performPreStartClusterDiscovery, only when checkNeedsClusterRecovery
//     reports that recovery is needed;
//  4. launchProcess followed by waitForReadyAndConnect;
//  5. background goroutines, all handed ctx: health monitoring and voter
//     reconciliation (discovery service attached only), plus the process
//     watchdog and the backup loop;
//  6. establishLeadershipOrJoin, then ApplyEmbeddedMigrations.
//
// Two failures are deliberately non-fatal and only logged: an error from
// checkNeedsClusterRecovery (pre-start discovery is skipped) and an error
// from applying the embedded migrations.
func (r *RQLiteManager) Start(ctx context.Context) error {
	rqliteDataDir, err := r.prepareDataDir()
	if err != nil {
		return err
	}

	// A nil discovery config would otherwise panic on the field access below.
	if r.discoverConfig == nil {
		return fmt.Errorf("discovery config is nil")
	}
	if r.discoverConfig.HttpAdvAddress == "" {
		return fmt.Errorf("discovery config HttpAdvAddress is empty")
	}

	if r.discoveryService != nil {
		if err := r.waitForMinClusterSizeBeforeStart(ctx, rqliteDataDir); err != nil {
			return err
		}
	}

	needsClusterRecovery, err := r.checkNeedsClusterRecovery(rqliteDataDir)
	if err != nil {
		// Startup continues exactly as before when the check itself fails,
		// but the failure is now visible instead of being dropped.
		r.logger.Warn("Failed to check whether cluster recovery is needed, skipping pre-start cluster discovery", zap.Error(err))
	} else if needsClusterRecovery {
		if err := r.performPreStartClusterDiscovery(ctx, rqliteDataDir); err != nil {
			return err
		}
	}

	if err := r.launchProcess(ctx, rqliteDataDir); err != nil {
		return err
	}
	if err := r.waitForReadyAndConnect(ctx); err != nil {
		return err
	}

	if r.discoveryService != nil {
		go r.startHealthMonitoring(ctx)
		go r.startVoterReconciliation(ctx)
	}

	// Start child process watchdog to detect and recover from crashes
	go r.startProcessWatchdog(ctx)

	// Start periodic RQLite backup loop (leader-only, self-checking)
	go r.startBackupLoop(ctx)

	if err := r.establishLeadershipOrJoin(ctx, rqliteDataDir); err != nil {
		return err
	}

	// Apply embedded migrations - these are compiled into the binary
	if err := r.ApplyEmbeddedMigrations(ctx, migrations.FS); err != nil {
		// Don't fail startup - migrations may have already been applied by
		// another node, or we may be joining an existing cluster.
		r.logger.Error("Failed to apply embedded migrations", zap.Error(err))
	} else {
		r.logger.Info("Database migrations applied successfully")
	}
	return nil
}
// GetConnection returns the manager's current RQLite connection handle.
// The result is nil when no connection is held: NewRQLiteManager does not
// create one, and Stop closes the connection and resets it to nil.
func (r *RQLiteManager) GetConnection() *gorqlite.Connection {
return r.connection
}
// Stop stops the RQLite node gracefully.
//
// It first closes and clears the client connection. If no process is
// attached it returns immediately. Otherwise it calls
// transferLeadershipIfLeader (which, in this file, only logs that the leader
// is expected to step down on SIGTERM), sends SIGTERM and waits up to 30s for
// the process to exit. On timeout it escalates to SIGKILL and waits a further
// bounded interval for the process to be reaped, so that the PID file is not
// removed while the process may still be running.
//
// Stop always returns nil; failures along the way are logged.
func (r *RQLiteManager) Stop() error {
	const (
		// Give RQLite 30s to flush pending writes and shut down gracefully
		// (previously 5s which risked Raft log corruption).
		gracePeriod = 30 * time.Second
		// Upper bound on waiting for the process to be reaped after SIGKILL,
		// so Stop cannot block indefinitely if Wait itself hangs.
		reapPeriod = 5 * time.Second
	)

	if r.connection != nil {
		r.connection.Close()
		r.connection = nil
	}
	if r.cmd == nil || r.cmd.Process == nil {
		return nil
	}

	// Attempt leadership transfer if we are the leader
	r.transferLeadershipIfLeader()

	// Best-effort: if the signal cannot be delivered, the wait below either
	// returns at once (process already gone) or times out and escalates.
	_ = r.cmd.Process.Signal(syscall.SIGTERM)

	// Buffered so the waiter goroutine can always finish, even if nobody
	// receives the result.
	done := make(chan error, 1)
	go func() { done <- r.cmd.Wait() }()

	select {
	case <-done:
	case <-time.After(gracePeriod):
		r.logger.Warn("RQLite did not stop within 30s, sending SIGKILL")
		if err := r.cmd.Process.Kill(); err != nil {
			r.logger.Warn("Failed to send SIGKILL to RQLite", zap.Error(err))
		}
		// Wait for the killed process to actually be reaped before cleaning up.
		select {
		case <-done:
		case <-time.After(reapPeriod):
			r.logger.Error("RQLite process was not reaped after SIGKILL")
		}
	}

	// Clean up PID file
	r.cleanupPIDFile()
	return nil
}
// transferLeadershipIfLeader logs, at shutdown, that this node is the Raft
// leader and that the hand-over is expected to happen on SIGTERM. It sends no
// transfer request of its own. It returns silently when the node status
// cannot be fetched or the reported Raft state is not "Leader".
func (r *RQLiteManager) transferLeadershipIfLeader() {
	st, err := r.getRQLiteStatus()
	if err != nil || st.Store.Raft.State != "Leader" {
		// Status unavailable, or another node leads: nothing to report.
		return
	}

	log := r.logger
	log.Info("This node is the Raft leader, requesting leadership transfer before shutdown")

	// Author's rationale, kept from the original: RQLite has no direct
	// leadership transfer API, so stepping down is left to rqlited's SIGTERM
	// handler (rqlite v8 steps down gracefully on SIGTERM when possible).
	// The state is logged purely for visibility.
	log.Info("Leader will transfer on SIGTERM (rqlite built-in behavior)")
}
// cleanupPIDFile deletes the PID file at <dataDir>/../logs/rqlited.pid.
// Removal is best-effort: any error, including the file not existing, is
// ignored.
func (r *RQLiteManager) cleanupPIDFile() {
	pidFile := fmt.Sprintf("%s/../logs", r.dataDir) + "/rqlited.pid"
	// Nothing useful can be done about a failed removal during shutdown.
	_ = os.Remove(pidFile)
}