mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-06-16 22:54:12 +00:00
fix(namespace): make WebRTC config survive slow/cold node restarts (#130)
Root cause of the recurring "turn.credentials → namespace_not_configured" on a
distant node: at converge the gateway resolves its TURN secret from the
namespace rqlite, and on a slow/just-restarted node that read fails ONCE, so
the gateway is written with TURN disabled. Removing the node is not a fix — the
software must tolerate a slow read.
Two-part fix (complements e7ed718's "don't blank a warm config"):
- RETRY the secret read (up to 5 attempts, 2s apart) at converge so a node
whose rqlite is still syncing waits for the read to land instead of writing
an empty block once. A genuine decrypt failure still exhausts the retries →
unresolved → the running config is preserved.
- CACHE the resolved secret into the node's own cluster-state.json
(applyResolvedWebRTCToState), so the NEXT cold start reads it from disk —
chooseRestoreWebRTC is state-first and short-circuits before the DB. The
state struct already had TURNSharedSecret "for cold start" but nothing
populated it; now it's filled on every successful resolve (only rewritten
on change). Each node self-heals its own cache; nothing new is sent
cross-node.
cluster-state.json now carries the TURN secret, so both writers (local
saveLocalState and the remote SaveClusterState) are tightened to 0600 + chmod.
A stale cached secret self-heals: disabling then re-enabling webrtc re-pushes
every node's config, and the next converge re-caches the new value.
Dual-reviewed: code-quality APPROVED; security SECURE after the remote-write
0600 fix. Tests: cache populate + short-circuit, no-change, turn-only node.
This commit is contained in:
parent
66db54c094
commit
2b184f0398
@ -1785,9 +1785,16 @@ func (cm *ClusterManager) saveLocalState(state *ClusterLocalState) error {
|
||||
return fmt.Errorf("failed to marshal state: %w", err)
|
||||
}
|
||||
path := filepath.Join(dir, "cluster-state.json")
|
||||
if err := os.WriteFile(path, data, 0644); err != nil {
|
||||
// 0600: this file now carries the namespace TURN shared secret for
|
||||
// cold-start resilience (bugboard #130), so it must not be world/group
|
||||
// readable. WriteFile's mode only applies on create — chmod explicitly so a
|
||||
// file written 0644 by an older release is tightened on the next rewrite.
|
||||
if err := os.WriteFile(path, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write state file: %w", err)
|
||||
}
|
||||
if err := os.Chmod(path, 0600); err != nil {
|
||||
return fmt.Errorf("failed to set state file permissions: %w", err)
|
||||
}
|
||||
cm.logger.Info("Saved cluster local state", zap.String("namespace", state.NamespaceName), zap.String("path", path))
|
||||
return nil
|
||||
}
|
||||
@ -1838,6 +1845,41 @@ func (cm *ClusterManager) RestoreLocalClustersFromDisk(ctx context.Context) (int
|
||||
|
||||
// restoreWebRTC is the resolved WebRTC gateway config for a restored
|
||||
// namespace gateway.
|
||||
const (
|
||||
// webrtcResolveRetries / webrtcResolveRetryDelay bound how long the converge
|
||||
// waits for a slow/just-restarted node's namespace rqlite to become readable
|
||||
// before giving up on the WebRTC secret. A distant node (high WG RTT) can
|
||||
// take a few seconds to sync; without this it reads empty once and comes up
|
||||
// with TURN disabled (bugboard #130). 5 × 2s = 10s ceiling on the cold path.
|
||||
webrtcResolveRetries = 5
|
||||
webrtcResolveRetryDelay = 2 * time.Second
|
||||
)
|
||||
|
||||
// applyResolvedWebRTCToState copies a freshly-resolved WebRTC config into the
|
||||
// local cluster state so a future cold start can read the TURN secret from disk
|
||||
// instead of the (possibly-slow) namespace rqlite (bugboard #130). Returns true
|
||||
// iff the state changed, so the caller only rewrites the on-disk file when
|
||||
// there's something to persist. Pure — unit-testable without a live cluster.
|
||||
func applyResolvedWebRTCToState(state *ClusterLocalState, wr restoreWebRTC) bool {
|
||||
hasTURN := wr.turnSecret != ""
|
||||
hasSFU := wr.sfuPort > 0
|
||||
if state.TURNSharedSecret == wr.turnSecret &&
|
||||
state.TURNDomain == wr.turnDomain &&
|
||||
state.TURNStealthDomain == wr.stealthDomain &&
|
||||
state.SFUSignalingPort == wr.sfuPort &&
|
||||
state.HasTURN == hasTURN &&
|
||||
state.HasSFU == hasSFU {
|
||||
return false
|
||||
}
|
||||
state.HasTURN = hasTURN
|
||||
state.HasSFU = hasSFU
|
||||
state.TURNSharedSecret = wr.turnSecret
|
||||
state.TURNDomain = wr.turnDomain
|
||||
state.TURNStealthDomain = wr.stealthDomain
|
||||
state.SFUSignalingPort = wr.sfuPort
|
||||
return true
|
||||
}
|
||||
|
||||
type restoreWebRTC struct {
|
||||
enabled bool
|
||||
sfuPort int
|
||||
@ -2081,18 +2123,35 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
|
||||
wr := chooseRestoreWebRTC(
|
||||
state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain,
|
||||
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) {
|
||||
webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName)
|
||||
// Retry the read on a transient error. A distant/slow node's
|
||||
// namespace rqlite may not be synced/readable yet at cold-start
|
||||
// converge time — without the retry the read fails once and the
|
||||
// gateway is written with TURN disabled (bugboard #130). The
|
||||
// secret IS in the DB; we just need the read to land once the
|
||||
// follower catches up (typically a few seconds). A genuine
|
||||
// decrypt failure (stale key) also errors here and will exhaust
|
||||
// the retries → unresolved → the caller preserves the running
|
||||
// config rather than blanking it.
|
||||
var webrtcCfg *WebRTCConfig
|
||||
var err error
|
||||
for attempt := 0; attempt < webrtcResolveRetries; attempt++ {
|
||||
webrtcCfg, err = cm.GetWebRTCConfig(ctx, state.NamespaceName)
|
||||
if err == nil {
|
||||
break // success — webrtcCfg may be nil (genuinely disabled)
|
||||
}
|
||||
if attempt < webrtcResolveRetries-1 {
|
||||
time.Sleep(webrtcResolveRetryDelay)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
// Do NOT swallow this into "disabled". A decrypt failure
|
||||
// (stale cluster-secret-derived key after rotation) or a
|
||||
// transient read error would otherwise silently disable
|
||||
// TURN on this node — turn.credentials then returns
|
||||
// namespace_not_configured (bugboard #130). Surface it
|
||||
// loudly and signal unresolved so the caller preserves the
|
||||
// running config.
|
||||
cm.logger.Error("WebRTC TURN secret unresolvable on this node — refusing to silently disable TURN; preserving existing gateway config. Likely a cluster-secret rotation; regenerate with `orama namespace disable webrtc` then `orama namespace enable webrtc`.",
|
||||
// Persistent error after retries (slow read that never
|
||||
// landed, or a decrypt failure). Do NOT swallow into
|
||||
// "disabled" — surface loudly and signal unresolved so the
|
||||
// caller preserves the running config (bugboard #130).
|
||||
cm.logger.Error("WebRTC TURN secret unresolvable on this node after retries — refusing to silently disable TURN; preserving existing gateway config. If this is a cluster-secret rotation, regenerate with `orama namespace disable webrtc` then `orama namespace enable webrtc`.",
|
||||
zap.String("namespace", state.NamespaceName),
|
||||
zap.String("node_id", cm.localNodeID),
|
||||
zap.Int("attempts", webrtcResolveRetries),
|
||||
zap.Error(err))
|
||||
return "", "", "", 0, false
|
||||
}
|
||||
@ -2122,6 +2181,23 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
|
||||
gwCfg.TURNDomain = wr.turnDomain
|
||||
gwCfg.TURNSecret = wr.turnSecret
|
||||
gwCfg.TURNStealthDomain = wr.stealthDomain
|
||||
|
||||
// Cache the resolved secret into THIS node's local state so the
|
||||
// NEXT cold start reads it from disk (state-first in
|
||||
// chooseRestoreWebRTC short-circuits before the DB) instead of
|
||||
// depending on a live, possibly-slow namespace-rqlite read — which
|
||||
// is exactly what left a distant/slow node's gateway with TURN
|
||||
// disabled on restart (bugboard #130). Each node self-heals its own
|
||||
// cache on a successful resolve; nothing is sent cross-node.
|
||||
if applyResolvedWebRTCToState(state, wr) {
|
||||
if err := cm.saveLocalState(state); err != nil {
|
||||
cm.logger.Warn("Failed to cache resolved WebRTC config to local state (cold start may fall back to the DB read next boot)",
|
||||
zap.String("namespace", state.NamespaceName), zap.Error(err))
|
||||
} else {
|
||||
cm.logger.Info("Cached resolved WebRTC config to local state for cold-start resilience (bugboard #130)",
|
||||
zap.String("namespace", state.NamespaceName))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort))
|
||||
|
||||
@ -211,3 +211,56 @@ func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) {
|
||||
t.Errorf("stealthDomain = %q; want empty when stealth is disabled", got.stealthDomain)
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Bugboard #130 — cache the resolved WebRTC secret into local state so a slow
|
||||
// node's cold start reads it from disk instead of the (slow) namespace rqlite.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
func TestApplyResolvedWebRTCToState_populatesAndReportsChange(t *testing.T) {
|
||||
st := &ClusterLocalState{} // fresh node: no cached secret (the #130 gap)
|
||||
wr := restoreWebRTC{enabled: true, turnSecret: "sek-123", turnDomain: "turn.ns-x.dbrs.space", stealthDomain: "cdn-abc.dbrs.space", sfuPort: 30000}
|
||||
|
||||
if !applyResolvedWebRTCToState(st, wr) {
|
||||
t.Fatal("expected change=true when caching a secret into empty state")
|
||||
}
|
||||
if st.TURNSharedSecret != "sek-123" {
|
||||
t.Errorf("TURNSharedSecret = %q; want sek-123 (must be cached for cold start)", st.TURNSharedSecret)
|
||||
}
|
||||
if !st.HasTURN || !st.HasSFU || st.SFUSignalingPort != 30000 ||
|
||||
st.TURNDomain != "turn.ns-x.dbrs.space" || st.TURNStealthDomain != "cdn-abc.dbrs.space" {
|
||||
t.Errorf("state not fully populated: %+v", st)
|
||||
}
|
||||
|
||||
// The whole point: a SECOND boot now reads the secret from state and must
|
||||
// NOT consult the DB (chooseRestoreWebRTC short-circuits).
|
||||
dbCalled := false
|
||||
got := chooseRestoreWebRTC(st.HasSFU, st.SFUSignalingPort, st.TURNDomain, st.TURNSharedSecret, st.TURNStealthDomain,
|
||||
func() (string, string, string, int, bool) { dbCalled = true; return dbError() })
|
||||
if dbCalled {
|
||||
t.Error("BUG #130: cold start still hit the DB even though the secret was cached in local state")
|
||||
}
|
||||
if !got.enabled || got.unresolved || got.turnSecret != "sek-123" {
|
||||
t.Errorf("cached cold start should resolve enabled from state; got %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyResolvedWebRTCToState_noChangeWhenAlreadyCached(t *testing.T) {
|
||||
st := &ClusterLocalState{HasTURN: true, HasSFU: true, TURNSharedSecret: "sek-123", TURNDomain: "d", TURNStealthDomain: "s", SFUSignalingPort: 30000}
|
||||
wr := restoreWebRTC{enabled: true, turnSecret: "sek-123", turnDomain: "d", stealthDomain: "s", sfuPort: 30000}
|
||||
if applyResolvedWebRTCToState(st, wr) {
|
||||
t.Error("expected change=false (no rewrite) when state already matches the resolved config")
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyResolvedWebRTCToState_turnOnlyNode_noSFU(t *testing.T) {
|
||||
// A gateway-only node (serves TURN credentials, runs no local SFU): secret
|
||||
// set, sfuPort 0. Must still cache the secret + report HasTURN, HasSFU=false.
|
||||
st := &ClusterLocalState{}
|
||||
if !applyResolvedWebRTCToState(st, restoreWebRTC{enabled: true, turnSecret: "sek", turnDomain: "d", sfuPort: 0}) {
|
||||
t.Fatal("want change=true")
|
||||
}
|
||||
if !st.HasTURN || st.HasSFU || st.TURNSharedSecret != "sek" {
|
||||
t.Errorf("turn-only node: want HasTURN=true HasSFU=false secret cached; got %+v", st)
|
||||
}
|
||||
}
|
||||
|
||||
@ -801,9 +801,16 @@ func (s *SystemdSpawner) SaveClusterState(namespace string, data []byte) error {
|
||||
return fmt.Errorf("failed to create namespace dir: %w", err)
|
||||
}
|
||||
path := filepath.Join(dir, "cluster-state.json")
|
||||
if err := os.WriteFile(path, data, 0644); err != nil {
|
||||
// 0600 + chmod: cluster-state.json carries the namespace TURN shared secret
|
||||
// for cold-start resilience (bugboard #130), so it must not be world/group
|
||||
// readable on the receiving node either. WriteFile's mode only applies on
|
||||
// create, so chmod explicitly to tighten a file an older release wrote 0644.
|
||||
if err := os.WriteFile(path, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write cluster state: %w", err)
|
||||
}
|
||||
if err := os.Chmod(path, 0600); err != nil {
|
||||
return fmt.Errorf("failed to set cluster state permissions: %w", err)
|
||||
}
|
||||
s.logger.Info("Saved cluster state from coordinator",
|
||||
zap.String("namespace", namespace),
|
||||
zap.String("path", path))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user