mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-06-16 23:54:13 +00:00
fix(namespace): don't silently disable TURN on unresolvable WebRTC secret (#130)
A freshly-joined namespace node could come up with TURN silently disabled
(turn.credentials -> namespace_not_configured) when GetWebRTCConfig errored:
the stored TURN shared secret was encrypted with a pre-rotation
cluster-secret-derived key and failed to decrypt, and the converge swallowed
that error into "WebRTC disabled", writing a TURN-disabled gateway config.
Distinguish "resolved but not enabled" (genuinely disabled, fine to write the
empty block) from "unresolved" (DB/decrypt error). chooseRestoreWebRTC's
dbFetch callback now returns a `resolved` bool; an unresolved lookup forces
enabled=false AND sets restoreWebRTC.unresolved. The converge then:
- logs the decrypt/read error loudly with the exact remediation
(`orama namespace disable webrtc` then `enable webrtc`) instead of
swallowing it;
- on the warm path, SKIPS ReconcileGateway so it doesn't rewrite a running
gateway's working WebRTC block to empty (preserves TURN);
- on the cold path, still spawns the gateway (the namespace needs one) but
warns loudly that TURN is degraded until the secret is regenerated.
Healthy nodes are unaffected: a node whose state file holds the secret
short-circuits before dbFetch, so a flaky/rotated DB cannot disable it.
Dual-reviewed (code-quality APPROVED, security SECURE — no secret material is
logged; operator disable still resolves to disabled, not unresolved).
Pure-function coverage in restore_webrtc_test.go: unresolved marking,
resolved-empty-is-disabled, and state-secret-wins-over-db-error.
This commit is contained in:
parent
021d362b2f
commit
e7ed718965
@ -1844,6 +1844,13 @@ type restoreWebRTC struct {
|
||||
turnDomain string
|
||||
turnSecret string
|
||||
stealthDomain string // feat-124: empty when webrtc stealth is disabled
|
||||
// unresolved is true when the state file had no TURN secret AND the DB
|
||||
// fallback ERRORED (vs. resolved-but-not-enabled). The caller must NOT
|
||||
// write a WebRTC-disabled gateway config off an unresolved lookup — that
|
||||
// silently kills turn.credentials on a node that should serve TURN
|
||||
// (bugboard #130: a decrypt failure after cluster-secret rotation was
|
||||
// swallowed into "disabled"). enabled is always false when unresolved.
|
||||
unresolved bool
|
||||
}
|
||||
|
||||
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config. TWO
|
||||
@ -1869,7 +1876,7 @@ type restoreWebRTC struct {
|
||||
// standing up the full restore path (systemd spawner + DB + port store).
|
||||
func chooseRestoreWebRTC(
|
||||
stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string,
|
||||
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int),
|
||||
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool),
|
||||
) restoreWebRTC {
|
||||
turnSecret := stateTURNSecret
|
||||
turnDomain := stateTURNDomain
|
||||
@ -1885,8 +1892,18 @@ func chooseRestoreWebRTC(
|
||||
// the state file was written reaches here with an empty secret.
|
||||
// (Stealth toggles DO rewrite cluster state on every node, so the
|
||||
// state-first read stays fresh for stealthDomain too.)
|
||||
unresolved := false
|
||||
if turnSecret == "" {
|
||||
if dbSecret, dbDomain, dbStealth, dbSFU := dbFetch(); dbSecret != "" {
|
||||
dbSecret, dbDomain, dbStealth, dbSFU, resolved := dbFetch()
|
||||
switch {
|
||||
case !resolved:
|
||||
// The DB/decrypt lookup ERRORED — we do not know whether WebRTC
|
||||
// is enabled. This is DISTINCT from resolved-but-empty (genuinely
|
||||
// disabled). Signal unresolved so the caller preserves the
|
||||
// running config instead of writing a TURN-disabled one
|
||||
// (bugboard #130).
|
||||
unresolved = true
|
||||
case dbSecret != "":
|
||||
turnSecret = dbSecret
|
||||
if turnDomain == "" {
|
||||
turnDomain = dbDomain
|
||||
@ -1901,7 +1918,8 @@ func chooseRestoreWebRTC(
|
||||
}
|
||||
|
||||
return restoreWebRTC{
|
||||
enabled: turnSecret != "" || sfuPort > 0,
|
||||
enabled: !unresolved && (turnSecret != "" || sfuPort > 0),
|
||||
unresolved: unresolved,
|
||||
sfuPort: sfuPort,
|
||||
turnDomain: turnDomain,
|
||||
turnSecret: turnSecret,
|
||||
@ -2062,10 +2080,25 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
|
||||
// file is incomplete.
|
||||
wr := chooseRestoreWebRTC(
|
||||
state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain,
|
||||
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int) {
|
||||
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) {
|
||||
webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName)
|
||||
if err != nil || webrtcCfg == nil {
|
||||
return "", "", "", 0
|
||||
if err != nil {
|
||||
// Do NOT swallow this into "disabled". A decrypt failure
|
||||
// (stale cluster-secret-derived key after rotation) or a
|
||||
// transient read error would otherwise silently disable
|
||||
// TURN on this node — turn.credentials then returns
|
||||
// namespace_not_configured (bugboard #130). Surface it
|
||||
// loudly and signal unresolved so the caller preserves the
|
||||
// running config.
|
||||
cm.logger.Error("WebRTC TURN secret unresolvable on this node — refusing to silently disable TURN; preserving existing gateway config. Likely a cluster-secret rotation; regenerate with `orama namespace disable webrtc` then `orama namespace enable webrtc`.",
|
||||
zap.String("namespace", state.NamespaceName),
|
||||
zap.String("node_id", cm.localNodeID),
|
||||
zap.Error(err))
|
||||
return "", "", "", 0, false
|
||||
}
|
||||
if webrtcCfg == nil {
|
||||
// Resolved cleanly: the namespace genuinely has no WebRTC.
|
||||
return "", "", "", 0, true
|
||||
}
|
||||
// TURN is namespace-wide; SFU port is per-node and may be
|
||||
// absent on a gateway-only (non-SFU) node — that's fine,
|
||||
@ -2077,7 +2110,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
|
||||
return webrtcCfg.TURNSharedSecret,
|
||||
fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain),
|
||||
cm.stealthDomainFor(state.NamespaceName, webrtcCfg),
|
||||
sfu
|
||||
sfu, true
|
||||
},
|
||||
)
|
||||
if wr.enabled {
|
||||
@ -2094,20 +2127,48 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
|
||||
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort))
|
||||
if err == nil {
|
||||
resp.Body.Close()
|
||||
// Gateway is already up. Reconcile config drift (bugboard #25 —
|
||||
// the WARM case): if the running gateway's on-disk config has a
|
||||
// WebRTC block that differs from the desired (e.g. it lost the
|
||||
// block on a prior restart where it stayed healthy and the
|
||||
// cold-spawn path below never ran), rewrite the config + restart.
|
||||
// ReconcileGateway is a no-op when the on-disk block already
|
||||
// matches, so this does NOT cause a restart loop on every boot.
|
||||
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
|
||||
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
|
||||
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
|
||||
switch {
|
||||
case wr.unresolved:
|
||||
// Bugboard #130 guard: the WebRTC secret could not be resolved
|
||||
// (DB/decrypt error, logged above). The gateway is already up
|
||||
// and may be serving TURN from a valid on-disk secret — do NOT
|
||||
// reconcile it to the empty/disabled block we'd otherwise
|
||||
// build, which would kill turn.credentials on this node. Leave
|
||||
// the running config untouched; the operator regenerates the
|
||||
// secret.
|
||||
//
|
||||
// Note: this also defers ReconcileGateway's #837
|
||||
// secrets-encryption-key reconcile for this one converge pass.
|
||||
// That is acceptable — the operator action that fixes the
|
||||
// unresolved TURN secret (regenerate + restart) re-runs the
|
||||
// full reconcile, and pre-fix this path would have corrupted
|
||||
// the WebRTC block anyway.
|
||||
cm.logger.Error("Gateway up but WebRTC secret unresolved — skipping reconcile to avoid disabling TURN on the running config (bugboard #130)",
|
||||
zap.String("namespace", state.NamespaceName))
|
||||
default:
|
||||
// Gateway is already up. Reconcile config drift (bugboard #25 —
|
||||
// the WARM case): if the running gateway's on-disk config has a
|
||||
// WebRTC block that differs from the desired (e.g. it lost the
|
||||
// block on a prior restart where it stayed healthy and the
|
||||
// cold-spawn path below never ran), rewrite the config +
|
||||
// restart. ReconcileGateway is a no-op when the on-disk block
|
||||
// already matches, so this does NOT cause a restart loop.
|
||||
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
|
||||
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
|
||||
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Gateway is down → cold spawn with the resolved config.
|
||||
if wr.enabled && !state.HasSFU {
|
||||
// Gateway is down → cold spawn. We must bring a gateway up
|
||||
// regardless (the namespace needs one); but if the WebRTC secret
|
||||
// was unresolved we can't write a working TURN block, so warn
|
||||
// loudly that TURN is degraded on this node until the secret is
|
||||
// regenerated (bugboard #130).
|
||||
switch {
|
||||
case wr.unresolved:
|
||||
cm.logger.Error("Cold-spawning gateway with TURN UNAVAILABLE — WebRTC secret unresolved on this node; turn.credentials will return namespace_not_configured until it is regenerated (`orama namespace disable webrtc` then `orama namespace enable webrtc`)",
|
||||
zap.String("namespace", state.NamespaceName))
|
||||
case wr.enabled && !state.HasSFU:
|
||||
cm.logger.Info("Re-materialized WebRTC gateway config from DB (state file was stale)",
|
||||
zap.String("namespace", state.NamespaceName),
|
||||
zap.Int("sfu_port", wr.sfuPort))
|
||||
|
||||
@ -11,11 +11,16 @@ import "testing"
|
||||
// port is per-node (0 on a gateway-only node). Pins both the drift
|
||||
// fallback and the non-SFU-gateway case.
|
||||
|
||||
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int).
|
||||
func dbNone() (string, string, string, int) { return "", "", "", 0 }
|
||||
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool).
|
||||
// resolved=true means the lookup completed (with or without a config);
|
||||
// resolved=false means it ERRORED (e.g. decrypt failure) → unresolved.
|
||||
func dbNone() (string, string, string, int, bool) { return "", "", "", 0, true }
|
||||
|
||||
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int) {
|
||||
return func() (string, string, string, int) { return secret, domain, "", sfuPort }
|
||||
// dbError models a DB/decrypt failure: the lookup did not complete.
|
||||
func dbError() (string, string, string, int, bool) { return "", "", "", 0, false }
|
||||
|
||||
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int, bool) {
|
||||
return func() (string, string, string, int, bool) { return secret, domain, "", sfuPort, true }
|
||||
}
|
||||
|
||||
func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) {
|
||||
@ -24,7 +29,7 @@ func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) {
|
||||
// restart path).
|
||||
dbCalled := false
|
||||
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "",
|
||||
func() (string, string, string, int) { dbCalled = true; return dbNone() })
|
||||
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() })
|
||||
|
||||
if dbCalled {
|
||||
t.Error("DB fetch was called even though the state file had the TURN secret (should short-circuit)")
|
||||
@ -85,7 +90,7 @@ func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) {
|
||||
// NOT consult the DB (TURN secret present = complete enough).
|
||||
dbCalled := false
|
||||
got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "",
|
||||
func() (string, string, string, int) { dbCalled = true; return dbNone() })
|
||||
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() })
|
||||
|
||||
if dbCalled {
|
||||
t.Error("DB fetch called even though state file had the TURN secret")
|
||||
@ -110,7 +115,7 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) {
|
||||
// enablement marker; without it we treat it as not-configured-for-
|
||||
// TURN, but an SFU port alone still enables SFU routes.
|
||||
got := chooseRestoreWebRTC(false, 0, "", "", "",
|
||||
func() (string, string, string, int) { return "", "turn.db", "", 9000 })
|
||||
func() (string, string, string, int, bool) { return "", "turn.db", "", 9000, true })
|
||||
// dbFetch only runs when state secret is empty; here it returns no
|
||||
// secret, so the `if dbSecret != ""` guard means NOTHING is taken
|
||||
// from the DB → disabled. (An SFU-only-no-TURN namespace is not a
|
||||
@ -126,7 +131,7 @@ func TestChooseRestoreWebRTC_stealthFromStateFile(t *testing.T) {
|
||||
// Stealth toggles rewrite cluster state, so a fresh state file carries
|
||||
// the stealth domain and must win without a DB call.
|
||||
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space",
|
||||
func() (string, string, string, int) {
|
||||
func() (string, string, string, int, bool) {
|
||||
t.Error("DB fetch called even though state file was complete")
|
||||
return dbNone()
|
||||
})
|
||||
@ -139,14 +144,65 @@ func TestChooseRestoreWebRTC_stealthFromDBOnStaleState(t *testing.T) {
|
||||
// Stale state (no TURN secret) + DB has stealth enabled → stealth domain
|
||||
// re-materializes from the DB alongside the rest of the WebRTC block.
|
||||
got := chooseRestoreWebRTC(false, 0, "", "", "",
|
||||
func() (string, string, string, int) {
|
||||
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801
|
||||
func() (string, string, string, int, bool) {
|
||||
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801, true
|
||||
})
|
||||
if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" {
|
||||
t.Errorf("want stealth domain from DB on stale state; got %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// --- bugboard #130: distinguish "unresolved (DB/decrypt error)" from "disabled" ---
|
||||
|
||||
func TestChooseRestoreWebRTC_dbErrorMarksUnresolvedNotDisabled(t *testing.T) {
|
||||
// The bug-130 case: state file has no secret (freshly-joined node) and
|
||||
// the DB lookup ERRORS (e.g. the stored TURN secret can't be decrypted
|
||||
// after a cluster-secret rotation). This MUST surface as unresolved —
|
||||
// NOT as a clean "disabled" — so the caller preserves the running config
|
||||
// instead of writing a TURN-disabled gateway (which made turn.credentials
|
||||
// return namespace_not_configured).
|
||||
got := chooseRestoreWebRTC(false, 0, "", "", "", dbError)
|
||||
|
||||
if !got.unresolved {
|
||||
t.Fatal("BUG #130 REGRESSION: a DB/decrypt error must mark the result unresolved")
|
||||
}
|
||||
if got.enabled {
|
||||
t.Errorf("unresolved result must never be enabled (would write a config off an errored lookup); got %+v", got)
|
||||
}
|
||||
if got.turnSecret != "" {
|
||||
t.Errorf("unresolved result must carry no secret; got %q", got.turnSecret)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChooseRestoreWebRTC_resolvedEmptyIsDisabledNotUnresolved(t *testing.T) {
|
||||
// The contrast case: the DB lookup COMPLETES and reports no WebRTC
|
||||
// (genuinely disabled namespace). This must be disabled, NOT unresolved —
|
||||
// the caller is free to write the empty/disabled config here.
|
||||
got := chooseRestoreWebRTC(false, 0, "", "", "", dbNone)
|
||||
|
||||
if got.unresolved {
|
||||
t.Error("a clean resolved-but-empty lookup must NOT be marked unresolved")
|
||||
}
|
||||
if got.enabled {
|
||||
t.Errorf("genuinely-disabled namespace must be disabled; got %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChooseRestoreWebRTC_stateSecretWinsOverDBError(t *testing.T) {
|
||||
// A node that already holds the TURN secret in its state file must NOT be
|
||||
// affected by a DB error — it short-circuits before dbFetch and stays
|
||||
// enabled/resolved. Guards against the #130 fix accidentally disabling
|
||||
// healthy nodes when the DB is flaky.
|
||||
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "",
|
||||
func() (string, string, string, int, bool) {
|
||||
t.Error("DB fetch must not be called when the state file has the secret")
|
||||
return dbError()
|
||||
})
|
||||
if got.unresolved || !got.enabled || got.turnSecret != "state-secret" {
|
||||
t.Errorf("state-file secret must win and stay enabled/resolved; got %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) {
|
||||
// Stealth disabled everywhere → empty stealthDomain (gateway advertises
|
||||
// the baseline 3-rung ladder only).
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user