orama/core/pkg/namespace/leader_locality.go
anonpenguin23 9c213a166c feat(serverless,namespace): cut namespace gateway RPC latency (#708)
The 5-10s RPCs that broke calling were not cold-start — they were
per-RPC sequential rqlite reads, each forwarded to a raft leader that
geography-blind election had placed on a 256ms-distant node.

Lever A (serverless): cache function metadata + env vars in-process
(5s TTL, invalidated on deploy/enable/disable/delete) and stop the hot
invoke path re-fetching the function for the authorization check —
removes ~820ms of leader-routed pre-flight reads from every op.

Lever B (namespace): a locality-aware leadership reconciler hands raft
leadership off a geographically-isolated namespace leader to the nearest
co-located voter, via rqlite's transfer-leadership API. All nodes stay
voters — membership, quorum and fault tolerance are unchanged. Cuts the
per-hop cost from ~274ms to ~20ms when a distant node had become leader.
2026-06-15 08:05:38 +03:00

214 lines
7.7 KiB
Go

package namespace
import (
"context"
"net"
"path/filepath"
"time"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
// Bugboard #708 — namespace raft leadership is geography-blind: the initial
// leader is sortedNodeIDs[0] over random libp2p peer IDs, and raft re-elects
// freely on every restart. When a geographically-distant node (high WireGuard
// RTT to its peers) becomes the leader, EVERY namespace write funnels through
// the distant node and waits on its cross-region replication for quorum — each
// rqlite hop jumps from ~20ms (co-located) to ~256ms, stacking into 5-10s RPCs
// that break calling.
//
// This reconciler keeps namespace leadership on a co-located voter. It NEVER
// removes a node or changes voter membership — all nodes stay voters (quorum
// and fault tolerance unchanged). It only hands leadership OFF a node that is
// isolated from the rest of the cluster, using rqlite's own
// transfer-leadership API.
// Tunables for the leader-locality reconciler.
const (
	// leaderLocalityInterval is the period between two reconciliation passes
	// on a node, i.e. how often it re-examines the namespace clusters it
	// leads.
	leaderLocalityInterval = 90 * time.Second

	// leaderLocalityDialTimeout is the upper bound on a single per-peer RTT
	// probe.
	leaderLocalityDialTimeout = 3 * time.Second

	// leaderLocalityRTTThreshold is the isolation cut-off: a leader whose
	// NEAREST voter peer is farther away than this is considered
	// geographically isolated and gives up leadership. Co-located nodes sit
	// around 20ms apart and a distant node around 256ms, so 100ms separates
	// the two groups with a wide margin on both sides.
	leaderLocalityRTTThreshold = 100 * time.Millisecond

	// leaderLocalityCooldown is the minimum time between two leadership moves
	// of the same namespace.
	//
	// In the usual topology (one distant node among co-located peers) a
	// single transfer is enough: the new leader has a nearby peer and so
	// never triggers a hand-off itself. If every node is far from every
	// other node there is no good leader, and nearest-peer transfers would
	// keep rotating leadership; the cooldown limits that to roughly one
	// transfer per node per window. That rotation is bounded and
	// non-destructive (membership and quorum are never modified), and it is
	// rare because node selection places most nodes ~20ms apart.
	leaderLocalityCooldown = 10 * time.Minute
)
// decideLeadershipTransfer is the pure decision: should the local leader hand
// off leadership, and to which voter?
//
// peerRTTs maps an identifier of each OTHER reachable voter to its measured
// RTT. The winning key is returned verbatim as the transfer target, so the
// caller must key the map by whatever it will pass to the transfer call
// (measurePeerRTTs keys it by node ID).
//
// It returns (target, true) ONLY when all of the following hold:
//   - this node is the leader;
//   - every voter is reachable (don't destabilize an already-degraded cluster);
//   - the cooldown has elapsed;
//   - peerRTTs is non-empty and even the CLOSEST peer is strictly farther than
//     threshold, i.e. the leader is isolated.
//
// Otherwise it returns ("", false): a leader with at least one nearby voter is
// central enough and is left alone.
//
// The target is the peer with the smallest RTT (which, in a 1-distant/N-close
// topology, is a co-located node that will then have a nearby peer of its own
// → stable). Equal RTTs are resolved by picking the lexicographically smallest
// key, so the result does not depend on Go's randomized map iteration order.
func decideLeadershipTransfer(isLeader, allVotersReachable, cooldownElapsed bool, peerRTTs map[string]time.Duration, threshold time.Duration) (string, bool) {
	if !isLeader || !allVotersReachable || !cooldownElapsed || len(peerRTTs) == 0 {
		return "", false
	}

	var (
		nearest    string
		nearestRTT time.Duration
		// found tracks "a candidate has been seen" explicitly. Using an empty
		// nearest as the sentinel would mis-handle a peer keyed by "": its
		// (possibly smallest) RTT could be overwritten by a later, larger one.
		found bool
	)
	for peer, rtt := range peerRTTs {
		if !found || rtt < nearestRTT || (rtt == nearestRTT && peer < nearest) {
			nearest, nearestRTT, found = peer, rtt, true
		}
	}

	// A nearby voter exists, so this leader is not isolated.
	if nearestRTT <= threshold {
		return "", false
	}
	return nearest, true
}
// measurePeerRTTs times a connection to every voter in nodes except selfID.
//
// The returned map is keyed by node ID and holds one entry per voter that was
// successfully probed. The returned bool is true only if no other voter was
// flagged unreachable and no probe failed, so the caller can refuse to act on
// a degraded cluster. Non-voters and the local node are ignored entirely.
func measurePeerRTTs(nodes rqlite.RQLiteNodes, selfID string) (map[string]time.Duration, bool) {
	rtts := map[string]time.Duration{}
	complete := true

	for _, node := range nodes {
		switch {
		case !node.Voter, node.ID == selfID:
			// Not a peer whose distance matters.
			continue
		case !node.Reachable:
			complete = false
			continue
		}

		// Dial the advertised address; fall back to the ID when none is set.
		target := node.Address
		if target == "" {
			target = node.ID
		}

		elapsed, err := measureRaftRTT(target, leaderLocalityDialTimeout)
		if err != nil {
			complete = false
			continue
		}
		rtts[node.ID] = elapsed
	}

	return rtts, complete
}
// measureRaftRTT returns the TCP-connect time to a peer's raft address — a
// privilege-free proxy for WireGuard round-trip latency.
//
// raftAddr is a "host:port" string handed to net.DialTimeout and timeout
// bounds the whole dial. The measured time covers everything the dial does,
// which includes name resolution when raftAddr carries a hostname.
//
// On failure it returns 0 and the dial error unchanged.
func measureRaftRTT(raftAddr string, timeout time.Duration) (time.Duration, error) {
	start := time.Now()
	conn, err := net.DialTimeout("tcp", raftAddr, timeout)
	if err != nil {
		return 0, err
	}
	// Sample the clock before closing so connection teardown is not counted
	// as part of the connect latency.
	elapsed := time.Since(start)
	// Best-effort close: the connection was opened only for timing and a
	// close error has no bearing on the measurement.
	_ = conn.Close()
	return elapsed, nil
}
// leaderTransferCooldownElapsed reports whether a leadership transfer may be
// attempted for namespace: true when no transfer has been recorded for it, or
// when at least leaderLocalityCooldown has passed since the recorded one.
func (cm *ClusterManager) leaderTransferCooldownElapsed(namespace string) bool {
	cm.leaderLocalityMu.Lock()
	defer cm.leaderLocalityMu.Unlock()

	lastTransfer, recorded := cm.leaderLocalityCooldown[namespace]
	if !recorded {
		return true
	}
	return time.Since(lastTransfer) >= leaderLocalityCooldown
}
// recordLeaderTransfer stamps namespace with the current time, which starts
// its cooldown window. The bookkeeping map is created on first use.
func (cm *ClusterManager) recordLeaderTransfer(namespace string) {
	cm.leaderLocalityMu.Lock()
	defer cm.leaderLocalityMu.Unlock()

	if cm.leaderLocalityCooldown != nil {
		cm.leaderLocalityCooldown[namespace] = time.Now()
		return
	}
	cm.leaderLocalityCooldown = map[string]time.Time{namespace: time.Now()}
}
// StartLeaderLocalityReconciler launches a background goroutine that runs the
// leadership-locality check every leaderLocalityInterval and exits once ctx
// is cancelled. It returns immediately. Every call starts its own loop, so it
// is meant to be called once, at node boot.
func (cm *ClusterManager) StartLeaderLocalityReconciler(ctx context.Context) {
	loop := func() {
		tick := time.NewTicker(leaderLocalityInterval)
		defer tick.Stop()

		done := ctx.Done()
		for {
			select {
			case <-tick.C:
				cm.reconcileLeaderLocality(ctx)
			case <-done:
				return
			}
		}
	}
	go loop()
}
// reconcileLeaderLocality runs one reconciliation pass. It finds every
// "<baseDataDir>/*/cluster-state.json" file, loads it, and reconciles the
// namespace it describes. State files that fail to load are skipped, and the
// pass stops early as soon as ctx is cancelled.
func (cm *ClusterManager) reconcileLeaderLocality(ctx context.Context) {
	statePaths, globErr := filepath.Glob(filepath.Join(cm.baseDataDir, "*", "cluster-state.json"))
	if globErr != nil {
		cm.logger.Debug("leader-locality: glob failed", zap.Error(globErr))
		return
	}

	for i := range statePaths {
		if ctx.Err() != nil {
			return
		}
		st, loadErr := loadLocalState(statePaths[i])
		if loadErr != nil {
			// Unreadable state file: move on to the next namespace.
			continue
		}
		cm.reconcileNamespaceLeader(st.NamespaceName, st.LocalPorts.RQLiteHTTPPort)
	}
}
// reconcileNamespaceLeader handles a single namespace's leadership locality.
//
// namespace names the cluster and rqliteHTTPPort is the local rqlite HTTP
// port for it; a zero port means there is nothing to query and the call is a
// no-op.
//
// The call does nothing unless the namespace's cooldown has elapsed, the
// local rqlite reports raft state "Leader", and decideLeadershipTransfer
// finds the leader isolated. In that case the cooldown is recorded and
// leadership is transferred to the chosen voter. Failures to query rqlite are
// treated as "nothing to do"; a failed transfer is logged as a warning.
func (cm *ClusterManager) reconcileNamespaceLeader(namespace string, rqliteHTTPPort int) {
	if rqliteHTTPPort == 0 {
		return
	}
	// Cheapest check first: while the cooldown is running no transfer can be
	// decided, so skip the rqlite status/nodes requests and the per-peer TCP
	// probes (each of which may block for up to leaderLocalityDialTimeout).
	if !cm.leaderTransferCooldownElapsed(namespace) {
		return
	}
	status, err := rqlite.GetRaftStatus(rqliteHTTPPort)
	if err != nil {
		// rqlite not up / not reachable on this node — nothing to do.
		return
	}
	if status.Store.Raft.State != "Leader" {
		return // only the leader can transfer leadership away
	}
	// This node is the leader, so the reported leader ID is its own ID.
	selfID := status.Store.Raft.LeaderID
	nodes, err := rqlite.GetRaftNodes(rqliteHTTPPort)
	if err != nil {
		return
	}
	peerRTTs, allVotersReachable := measurePeerRTTs(nodes, selfID)
	// isLeader and cooldownElapsed were both established above.
	target, transfer := decideLeadershipTransfer(
		true, allVotersReachable, true,
		peerRTTs, leaderLocalityRTTThreshold,
	)
	if !transfer {
		return
	}
	cm.logger.Info("leader-locality: this node is an isolated namespace raft leader — transferring leadership to a co-located voter (bugboard #708)",
		zap.String("namespace", namespace),
		zap.String("from", selfID),
		zap.String("to", target),
		zap.Duration("target_rtt", peerRTTs[target]),
	)
	// Record the cooldown BEFORE the transfer so a slow/looping transfer can't
	// re-fire on the next tick regardless of outcome.
	cm.recordLeaderTransfer(namespace)
	if err := rqlite.TransferLeadershipTo(rqliteHTTPPort, target, cm.logger); err != nil {
		cm.logger.Warn("leader-locality: leadership transfer failed",
			zap.String("namespace", namespace), zap.Error(err))
	}
}