fix(namespace): don't silently disable TURN on unresolvable WebRTC secret (#130)

A freshly-joined namespace node could come up with TURN silently disabled
(turn.credentials -> namespace_not_configured) when GetWebRTCConfig errored:
the stored TURN shared secret was encrypted with a pre-rotation
cluster-secret-derived key and failed to decrypt, and the converge swallowed
that error into "WebRTC disabled", writing a TURN-disabled gateway config.

Distinguish "resolved but not enabled" (genuinely disabled, fine to write the
empty block) from "unresolved" (DB/decrypt error). chooseRestoreWebRTC's
dbFetch callback now returns a `resolved` bool; an unresolved lookup forces
enabled=false AND sets restoreWebRTC.unresolved. The converge then:
  - logs the decrypt/read error loudly with the exact remediation
    (`orama namespace disable webrtc` then `enable webrtc`) instead of
    swallowing it;
  - on the warm path, SKIPS ReconcileGateway so it doesn't rewrite a running
    gateway's working WebRTC block to empty (preserves TURN);
  - on the cold path, still spawns the gateway (the namespace needs one) but
    warns loudly that TURN is degraded until the secret is regenerated.

Healthy nodes are unaffected: a node whose state file holds the secret
short-circuits before dbFetch, so a flaky/rotated DB cannot disable it.
Dual-reviewed (code-quality APPROVED, security SECURE — no secret material is
logged; operator disable still resolves to disabled, not unresolved).

Pure-function coverage in restore_webrtc_test.go: unresolved marking,
resolved-empty-is-disabled, and state-secret-wins-over-db-error.
This commit is contained in:
anonpenguin23 2026-06-12 16:44:25 +03:00
parent 021d362b2f
commit e7ed718965
2 changed files with 146 additions and 29 deletions

View File

@ -1844,6 +1844,13 @@ type restoreWebRTC struct {
turnDomain string
turnSecret string
stealthDomain string // feat-124: empty when webrtc stealth is disabled
// unresolved is true when the state file had no TURN secret AND the DB
// fallback ERRORED (vs. resolved-but-not-enabled). The caller must NOT
// write a WebRTC-disabled gateway config off an unresolved lookup — that
// silently kills turn.credentials on a node that should serve TURN
// (bugboard #130: a decrypt failure after cluster-secret rotation was
// swallowed into "disabled"). enabled is always false when unresolved.
unresolved bool
}
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config. TWO
@ -1869,7 +1876,7 @@ type restoreWebRTC struct {
// standing up the full restore path (systemd spawner + DB + port store).
func chooseRestoreWebRTC(
stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string,
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int),
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool),
) restoreWebRTC {
turnSecret := stateTURNSecret
turnDomain := stateTURNDomain
@ -1885,8 +1892,18 @@ func chooseRestoreWebRTC(
// the state file was written reaches here with an empty secret.
// (Stealth toggles DO rewrite cluster state on every node, so the
// state-first read stays fresh for stealthDomain too.)
unresolved := false
if turnSecret == "" {
if dbSecret, dbDomain, dbStealth, dbSFU := dbFetch(); dbSecret != "" {
dbSecret, dbDomain, dbStealth, dbSFU, resolved := dbFetch()
switch {
case !resolved:
// The DB/decrypt lookup ERRORED — we do not know whether WebRTC
// is enabled. This is DISTINCT from resolved-but-empty (genuinely
// disabled). Signal unresolved so the caller preserves the
// running config instead of writing a TURN-disabled one
// (bugboard #130).
unresolved = true
case dbSecret != "":
turnSecret = dbSecret
if turnDomain == "" {
turnDomain = dbDomain
@ -1901,7 +1918,8 @@ func chooseRestoreWebRTC(
}
return restoreWebRTC{
enabled: turnSecret != "" || sfuPort > 0,
enabled: !unresolved && (turnSecret != "" || sfuPort > 0),
unresolved: unresolved,
sfuPort: sfuPort,
turnDomain: turnDomain,
turnSecret: turnSecret,
@ -2062,10 +2080,25 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
// file is incomplete.
wr := chooseRestoreWebRTC(
state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain,
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int) {
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) {
webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName)
if err != nil || webrtcCfg == nil {
return "", "", "", 0
if err != nil {
// Do NOT swallow this into "disabled". A decrypt failure
// (stale cluster-secret-derived key after rotation) or a
// transient read error would otherwise silently disable
// TURN on this node — turn.credentials then returns
// namespace_not_configured (bugboard #130). Surface it
// loudly and signal unresolved so the caller preserves the
// running config.
cm.logger.Error("WebRTC TURN secret unresolvable on this node — refusing to silently disable TURN; preserving existing gateway config. Likely a cluster-secret rotation; regenerate with `orama namespace disable webrtc` then `orama namespace enable webrtc`.",
zap.String("namespace", state.NamespaceName),
zap.String("node_id", cm.localNodeID),
zap.Error(err))
return "", "", "", 0, false
}
if webrtcCfg == nil {
// Resolved cleanly: the namespace genuinely has no WebRTC.
return "", "", "", 0, true
}
// TURN is namespace-wide; SFU port is per-node and may be
// absent on a gateway-only (non-SFU) node — that's fine,
@ -2077,7 +2110,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
return webrtcCfg.TURNSharedSecret,
fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain),
cm.stealthDomainFor(state.NamespaceName, webrtcCfg),
sfu
sfu, true
},
)
if wr.enabled {
@ -2094,20 +2127,48 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort))
if err == nil {
resp.Body.Close()
// Gateway is already up. Reconcile config drift (bugboard #25 —
// the WARM case): if the running gateway's on-disk config has a
// WebRTC block that differs from the desired (e.g. it lost the
// block on a prior restart where it stayed healthy and the
// cold-spawn path below never ran), rewrite the config + restart.
// ReconcileGateway is a no-op when the on-disk block already
// matches, so this does NOT cause a restart loop on every boot.
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
switch {
case wr.unresolved:
// Bugboard #130 guard: the WebRTC secret could not be resolved
// (DB/decrypt error, logged above). The gateway is already up
// and may be serving TURN from a valid on-disk secret — do NOT
// reconcile it to the empty/disabled block we'd otherwise
// build, which would kill turn.credentials on this node. Leave
// the running config untouched; the operator regenerates the
// secret.
//
// Note: this also defers ReconcileGateway's #837
// secrets-encryption-key reconcile for this one converge pass.
// That is acceptable — the operator action that fixes the
// unresolved TURN secret (regenerate + restart) re-runs the
// full reconcile, and pre-fix this path would have corrupted
// the WebRTC block anyway.
cm.logger.Error("Gateway up but WebRTC secret unresolved — skipping reconcile to avoid disabling TURN on the running config (bugboard #130)",
zap.String("namespace", state.NamespaceName))
default:
// Gateway is already up. Reconcile config drift (bugboard #25 —
// the WARM case): if the running gateway's on-disk config has a
// WebRTC block that differs from the desired (e.g. it lost the
// block on a prior restart where it stayed healthy and the
// cold-spawn path below never ran), rewrite the config +
// restart. ReconcileGateway is a no-op when the on-disk block
// already matches, so this does NOT cause a restart loop.
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
}
}
} else {
// Gateway is down → cold spawn with the resolved config.
if wr.enabled && !state.HasSFU {
// Gateway is down → cold spawn. We must bring a gateway up
// regardless (the namespace needs one); but if the WebRTC secret
// was unresolved we can't write a working TURN block, so warn
// loudly that TURN is degraded on this node until the secret is
// regenerated (bugboard #130).
switch {
case wr.unresolved:
cm.logger.Error("Cold-spawning gateway with TURN UNAVAILABLE — WebRTC secret unresolved on this node; turn.credentials will return namespace_not_configured until it is regenerated (`orama namespace disable webrtc` then `orama namespace enable webrtc`)",
zap.String("namespace", state.NamespaceName))
case wr.enabled && !state.HasSFU:
cm.logger.Info("Re-materialized WebRTC gateway config from DB (state file was stale)",
zap.String("namespace", state.NamespaceName),
zap.Int("sfu_port", wr.sfuPort))

View File

@ -11,11 +11,16 @@ import "testing"
// port is per-node (0 on a gateway-only node). Pins both the drift
// fallback and the non-SFU-gateway case.
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int).
func dbNone() (string, string, string, int) { return "", "", "", 0 }
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool).
// resolved=true means the lookup completed (with or without a config);
// resolved=false means it ERRORED (e.g. decrypt failure) → unresolved.
func dbNone() (string, string, string, int, bool) { return "", "", "", 0, true }
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int) {
return func() (string, string, string, int) { return secret, domain, "", sfuPort }
// dbError models a DB/decrypt failure: the lookup did not complete.
func dbError() (string, string, string, int, bool) { return "", "", "", 0, false }
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int, bool) {
return func() (string, string, string, int, bool) { return secret, domain, "", sfuPort, true }
}
func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) {
@ -24,7 +29,7 @@ func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) {
// restart path).
dbCalled := false
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "",
func() (string, string, string, int) { dbCalled = true; return dbNone() })
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() })
if dbCalled {
t.Error("DB fetch was called even though the state file had the TURN secret (should short-circuit)")
@ -85,7 +90,7 @@ func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) {
// NOT consult the DB (TURN secret present = complete enough).
dbCalled := false
got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "",
func() (string, string, string, int) { dbCalled = true; return dbNone() })
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() })
if dbCalled {
t.Error("DB fetch called even though state file had the TURN secret")
@ -110,7 +115,7 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) {
// enablement marker; without it we treat it as not-configured-for-
// TURN, but an SFU port alone still enables SFU routes.
got := chooseRestoreWebRTC(false, 0, "", "", "",
func() (string, string, string, int) { return "", "turn.db", "", 9000 })
func() (string, string, string, int, bool) { return "", "turn.db", "", 9000, true })
// dbFetch only runs when state secret is empty; here it returns no
// secret, so the `if dbSecret != ""` guard means NOTHING is taken
// from the DB → disabled. (An SFU-only-no-TURN namespace is not a
@ -126,7 +131,7 @@ func TestChooseRestoreWebRTC_stealthFromStateFile(t *testing.T) {
// Stealth toggles rewrite cluster state, so a fresh state file carries
// the stealth domain and must win without a DB call.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space",
func() (string, string, string, int) {
func() (string, string, string, int, bool) {
t.Error("DB fetch called even though state file was complete")
return dbNone()
})
@ -139,14 +144,65 @@ func TestChooseRestoreWebRTC_stealthFromDBOnStaleState(t *testing.T) {
// Stale state (no TURN secret) + DB has stealth enabled → stealth domain
// re-materializes from the DB alongside the rest of the WebRTC block.
got := chooseRestoreWebRTC(false, 0, "", "", "",
func() (string, string, string, int) {
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801
func() (string, string, string, int, bool) {
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801, true
})
if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" {
t.Errorf("want stealth domain from DB on stale state; got %+v", got)
}
}
// --- bugboard #130: distinguish "unresolved (DB/decrypt error)" from "disabled" ---
func TestChooseRestoreWebRTC_dbErrorMarksUnresolvedNotDisabled(t *testing.T) {
// The bug-130 case: state file has no secret (freshly-joined node) and
// the DB lookup ERRORS (e.g. the stored TURN secret can't be decrypted
// after a cluster-secret rotation). This MUST surface as unresolved —
// NOT as a clean "disabled" — so the caller preserves the running config
// instead of writing a TURN-disabled gateway (which made turn.credentials
// return namespace_not_configured).
got := chooseRestoreWebRTC(false, 0, "", "", "", dbError)
if !got.unresolved {
t.Fatal("BUG #130 REGRESSION: a DB/decrypt error must mark the result unresolved")
}
if got.enabled {
t.Errorf("unresolved result must never be enabled (would write a config off an errored lookup); got %+v", got)
}
if got.turnSecret != "" {
t.Errorf("unresolved result must carry no secret; got %q", got.turnSecret)
}
}
func TestChooseRestoreWebRTC_resolvedEmptyIsDisabledNotUnresolved(t *testing.T) {
// The contrast case: the DB lookup COMPLETES and reports no WebRTC
// (genuinely disabled namespace). This must be disabled, NOT unresolved —
// the caller is free to write the empty/disabled config here.
got := chooseRestoreWebRTC(false, 0, "", "", "", dbNone)
if got.unresolved {
t.Error("a clean resolved-but-empty lookup must NOT be marked unresolved")
}
if got.enabled {
t.Errorf("genuinely-disabled namespace must be disabled; got %+v", got)
}
}
func TestChooseRestoreWebRTC_stateSecretWinsOverDBError(t *testing.T) {
// A node that already holds the TURN secret in its state file must NOT be
// affected by a DB error — it short-circuits before dbFetch and stays
// enabled/resolved. Guards against the #130 fix accidentally disabling
// healthy nodes when the DB is flaky.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "",
func() (string, string, string, int, bool) {
t.Error("DB fetch must not be called when the state file has the secret")
return dbError()
})
if got.unresolved || !got.enabled || got.turnSecret != "state-secret" {
t.Errorf("state-file secret must win and stay enabled/resolved; got %+v", got)
}
}
func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) {
// Stealth disabled everywhere → empty stealthDomain (gateway advertises
// the baseline 3-rung ladder only).