From 4fc975216f334b1ad49518b6a707845e6bc3d1ce Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 30 May 2026 14:39:39 +0300 Subject: [PATCH] feat(gateway): fix WebRTC config persistence and endpoint access - Add internal WebRTC management endpoints to public path exemption list - Implement DB fallback for WebRTC configuration during cluster restore - Add unit tests to verify WebRTC config precedence and state self-healing --- core/pkg/gateway/middleware.go | 12 +++ core/pkg/gateway/middleware_test.go | 9 ++ core/pkg/namespace/cluster_manager.go | 88 ++++++++++++++++- core/pkg/namespace/restore_webrtc_test.go | 109 ++++++++++++++++++++++ 4 files changed, 213 insertions(+), 5 deletions(-) create mode 100644 core/pkg/namespace/restore_webrtc_test.go diff --git a/core/pkg/gateway/middleware.go b/core/pkg/gateway/middleware.go index 4ba9034..d02b9f2 100644 --- a/core/pkg/gateway/middleware.go +++ b/core/pkg/gateway/middleware.go @@ -660,6 +660,18 @@ func isPublicPath(p string) bool { return true } + // Namespace WebRTC management endpoints (enable/disable/status). Auth is + // handled INSIDE the handlers by the X-Orama-Internal-Auth header + + // WireGuard-peer source check (same as spawn/repair above). Without this + // exemption the API-key middleware rejects them with "missing API key" + // before the handler's internal-auth check runs, making the internal + // endpoints unreachable — so `orama namespace enable webrtc` had no + // working path (the public endpoint hits a gateway without the WebRTC + // manager wired). Bugboard: internal webrtc mgmt endpoints unreachable. 
+ if strings.HasPrefix(p, "/v1/internal/namespace/webrtc/") { + return true + } + // Vault proxy endpoints (no auth — rate-limited per identity hash within handler) if strings.HasPrefix(p, "/v1/vault/") { return true diff --git a/core/pkg/gateway/middleware_test.go b/core/pkg/gateway/middleware_test.go index b5e38cb..01d610e 100644 --- a/core/pkg/gateway/middleware_test.go +++ b/core/pkg/gateway/middleware_test.go @@ -171,6 +171,15 @@ func TestIsPublicPath(t *testing.T) { {"internal join", "/v1/internal/join", true}, {"internal namespace spawn", "/v1/internal/namespace/spawn", true}, {"internal namespace repair", "/v1/internal/namespace/repair", true}, + // Internal WebRTC mgmt endpoints — exempt from API-key middleware + // (handler enforces internal-auth header + WireGuard peer). Without + // these, `orama namespace enable webrtc` had no working path. + {"internal webrtc enable", "/v1/internal/namespace/webrtc/enable", true}, + {"internal webrtc disable", "/v1/internal/namespace/webrtc/disable", true}, + {"internal webrtc status", "/v1/internal/namespace/webrtc/status", true}, + // Guard: the PUBLIC webrtc mgmt path must STILL require auth (only + // the /internal/ variant is exempt). + {"public webrtc enable still requires auth", "/v1/namespace/webrtc/enable", false}, {"phantom session", "/v1/auth/phantom/session", true}, {"phantom complete", "/v1/auth/phantom/complete", true}, diff --git a/core/pkg/namespace/cluster_manager.go b/core/pkg/namespace/cluster_manager.go index 1bb08a9..c4e5bc2 100644 --- a/core/pkg/namespace/cluster_manager.go +++ b/core/pkg/namespace/cluster_manager.go @@ -1815,6 +1815,52 @@ func (cm *ClusterManager) RestoreLocalClustersFromDisk(ctx context.Context) (int return restored, nil } +// restoreWebRTC is the resolved WebRTC gateway config for a restored +// namespace gateway. 
type restoreWebRTC struct {
	enabled    bool   // true only when a complete WebRTC block was resolved
	sfuPort    int    // SFU signaling port to hand to the gateway
	turnDomain string // TURN domain to hand to the gateway (not validated here)
	turnSecret string // TURN shared secret to hand to the gateway
}

// chooseRestoreWebRTC decides the WebRTC fields for a restored namespace
// gateway.
//
// Precedence:
//  1. The local state file wins when it carries a complete WebRTC block
//     (stateHasSFU set, stateSFUPort > 0, non-empty stateTURNSecret). In that
//     case dbFetch is never invoked.
//  2. Otherwise dbFetch is consulted; its result is used only when it reports
//     enabled with a positive port and a non-empty secret.
//  3. In every other case — including a nil dbFetch — the zero (disabled)
//     restoreWebRTC is returned.
//
// The TURN domain is passed through from whichever source is chosen and is
// not part of the completeness check.
//
// Bugboard #25: namespaces that had WebRTC enabled AFTER their state file
// was written carry no SFU/TURN fields in state. Without the DB fallback,
// the from-disk restore regenerates the gateway config without the webrtc
// block on every restart — SFU/TURN keep running but the gateway loses
// turn_secret + sfu_port (credentials configured:false, routes 404).
//
// Kept as a pure function so the precedence is unit-testable without
// standing up the full restore path (systemd spawner + DB + port store).
func chooseRestoreWebRTC(
	stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret string,
	dbFetch func() (enabled bool, sfuPort int, turnDomain, turnSecret string),
) restoreWebRTC {
	if stateHasSFU && stateSFUPort > 0 && stateTURNSecret != "" {
		return restoreWebRTC{
			enabled:    true,
			sfuPort:    stateSFUPort,
			turnDomain: stateTURNDomain,
			turnSecret: stateTURNSecret,
		}
	}

	// Calling a nil func value panics; with no fallback source available the
	// only safe answer is "disabled".
	if dbFetch == nil {
		return restoreWebRTC{}
	}

	enabled, sfuPort, turnDomain, turnSecret := dbFetch()
	if !enabled || sfuPort <= 0 || turnSecret == "" {
		// A half-provisioned DB row must not produce a broken gateway block.
		return restoreWebRTC{}
	}
	return restoreWebRTC{
		enabled:    true,
		sfuPort:    sfuPort,
		turnDomain: turnDomain,
		turnSecret: turnSecret,
	}
}

// restoreClusterFromState restores all processes for a cluster using local state (no DB queries).
func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *ClusterLocalState) error { cm.logger.Info("Restoring namespace cluster from local state", @@ -1961,12 +2007,44 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl IPFSReplicationFactor: cm.ipfsReplicationFactor, } - // Add WebRTC config from persisted local state - if state.HasSFU && state.SFUSignalingPort > 0 && state.TURNSharedSecret != "" { + // Resolve WebRTC config for the restored gateway. Prefer the + // local state file; fall back to the DB (source of truth) to + // self-heal stale state. Bugboard #25 — the state file is NOT + // updated by EnableWebRTC, so a namespace enabled AFTER its state + // file was written carries no SFU/TURN fields here. Because this + // from-disk restore runs BEFORE the DB-backed restore and + // succeeds, the gateway config would otherwise be regenerated + // WITHOUT the webrtc block on every restart — SFU/TURN services + // keep running but the gateway has empty turn_secret + sfu_port=0 + // (credentials return configured:false / 404, routes don't + // register). The lazy dbFetch only hits the DB when the state + // file is incomplete. 
+ wr := chooseRestoreWebRTC( + state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, + func() (bool, int, string, string) { + webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName) + if err != nil || webrtcCfg == nil { + return false, 0, "", "" + } + sfuBlock, err := cm.webrtcPortAllocator.GetSFUPorts(ctx, state.ClusterID, cm.localNodeID) + if err != nil || sfuBlock == nil { + return false, 0, "", "" + } + return true, sfuBlock.SFUSignalingPort, + fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain), + webrtcCfg.TURNSharedSecret + }, + ) + if wr.enabled { gwCfg.WebRTCEnabled = true - gwCfg.SFUPort = state.SFUSignalingPort - gwCfg.TURNDomain = state.TURNDomain - gwCfg.TURNSecret = state.TURNSharedSecret + gwCfg.SFUPort = wr.sfuPort + gwCfg.TURNDomain = wr.turnDomain + gwCfg.TURNSecret = wr.turnSecret + if !state.HasSFU { + cm.logger.Info("Re-materialized WebRTC gateway config from DB (state file was stale)", + zap.String("namespace", state.NamespaceName), + zap.Int("sfu_port", wr.sfuPort)) + } } if err := cm.spawnGatewayWithSystemd(ctx, gwCfg); err != nil { diff --git a/core/pkg/namespace/restore_webrtc_test.go b/core/pkg/namespace/restore_webrtc_test.go new file mode 100644 index 0000000..92703d2 --- /dev/null +++ b/core/pkg/namespace/restore_webrtc_test.go @@ -0,0 +1,109 @@ +package namespace + +import "testing" + +// Bugboard #25 — WebRTC config drift on restart. +// +// chooseRestoreWebRTC decides the gateway's WebRTC fields when a node +// restores namespace clusters from its local state file. The local state +// file is NOT updated by EnableWebRTC, so a namespace enabled after its +// state file was written has no SFU/TURN fields there — and because the +// from-disk restore runs first and succeeds, the DB-backed restore (which +// DOES read WebRTC) never runs. 
Result: the gateway config loses its +// webrtc block on every restart (SFU/TURN services keep running but the +// gateway reports configured:false and /v1/webrtc/turn/credentials 404s). +// +// These tests pin the precedence: state file when complete, DB fallback +// otherwise. The bug was the missing DB fallback. + +func dbDisabled() (bool, int, string, string) { return false, 0, "", "" } + +func dbEnabled(port int, domain, secret string) func() (bool, int, string, string) { + return func() (bool, int, string, string) { return true, port, domain, secret } +} + +func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) { + // State file has a full block → use it, and NEVER consult the DB + // (the lazy dbFetch must not be called — saves a query on the hot + // restart path). + dbCalled := false + got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", + func() (bool, int, string, string) { dbCalled = true; return dbDisabled() }) + + if dbCalled { + t.Error("DB fetch was called even though the state file was complete (should short-circuit)") + } + if !got.enabled || got.sfuPort != 7800 || got.turnSecret != "state-secret" { + t.Errorf("want state-file values; got %+v", got) + } + if got.turnDomain != "turn.ns-x.dbrs.space" { + t.Errorf("turnDomain = %q; want state-file value", got.turnDomain) + } +} + +func TestChooseRestoreWebRTC_staleStateFallsBackToDB(t *testing.T) { + // The actual bug-25 case: state file has NO webrtc (stale — written + // before enable), but the DB says enabled. MUST fall back to the DB + // so the block re-materializes instead of being silently dropped. 
+ got := chooseRestoreWebRTC(false, 0, "", "", + dbEnabled(7801, "turn.ns-anchat-test.dbrs.space", "db-secret")) + + if !got.enabled { + t.Fatal("BUG #25 REGRESSION: stale state file + DB-enabled WebRTC must fall back to DB; got disabled") + } + if got.sfuPort != 7801 { + t.Errorf("sfuPort = %d; want 7801 (from DB)", got.sfuPort) + } + if got.turnSecret != "db-secret" { + t.Errorf("turnSecret = %q; want db-secret (from DB)", got.turnSecret) + } + if got.turnDomain != "turn.ns-anchat-test.dbrs.space" { + t.Errorf("turnDomain = %q; want DB-derived value", got.turnDomain) + } +} + +func TestChooseRestoreWebRTC_bothEmptyDisabled(t *testing.T) { + // Namespace genuinely without WebRTC: state file empty, DB disabled. + // Must return disabled so we don't register broken webrtc routes. + got := chooseRestoreWebRTC(false, 0, "", "", dbDisabled) + if got.enabled { + t.Errorf("want disabled when neither source has WebRTC; got %+v", got) + } +} + +func TestChooseRestoreWebRTC_incompleteStateFileFallsToDB(t *testing.T) { + // State file partially populated (HasSFU but missing secret, or + // port 0) must NOT be treated as complete — fall through to DB. + // Catches a regression where a half-written state file shadows the + // DB and yields a broken (secret-less) gateway config. 
+ cases := []struct { + name string + hasSFU bool + sfuPort int + turnSec string + }{ + {"hasSFU but port 0", true, 0, "s"}, + {"hasSFU but empty secret", true, 7800, ""}, + {"no hasSFU flag", false, 7800, "s"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := chooseRestoreWebRTC(tc.hasSFU, tc.sfuPort, "d", tc.turnSec, + dbEnabled(9000, "turn.db", "db-secret")) + if !got.enabled || got.sfuPort != 9000 || got.turnSecret != "db-secret" { + t.Errorf("incomplete state file should fall back to DB; got %+v", got) + } + }) + } +} + +func TestChooseRestoreWebRTC_dbIncompleteStaysDisabled(t *testing.T) { + // Defensive: if the DB row exists but is itself incomplete (no port + // or no secret — e.g. a half-provisioned enable), do NOT enable with + // a broken block. Better disabled than registering routes that 500. + got := chooseRestoreWebRTC(false, 0, "", "", + func() (bool, int, string, string) { return true, 0, "turn.db", "" }) + if got.enabled { + t.Errorf("DB row incomplete (port 0, no secret): want disabled; got %+v", got) + } +}