diff --git a/core/cmd/gateway/config.go b/core/cmd/gateway/config.go index 5810f50..720aa20 100644 --- a/core/cmd/gateway/config.go +++ b/core/cmd/gateway/config.go @@ -74,6 +74,10 @@ func parseGatewayConfig(logger *logging.ColoredLogger) *gateway.Config { SFUPort int `yaml:"sfu_port"` TURNDomain string `yaml:"turn_domain"` TURNSecret string `yaml:"turn_secret"` + // TURNStealthDomain is the neutral stealth TURNS:443 host (feat-124). + // Maps to cfg.StealthCDNDomain so turn.credentials advertises the + // stealth rung of the URI ladder. + TURNStealthDomain string `yaml:"turn_stealth_domain"` } type yamlCfg struct { @@ -256,6 +260,9 @@ func parseGatewayConfig(logger *logging.ColoredLogger) *gateway.Config { if v := strings.TrimSpace(y.WebRTC.TURNSecret); v != "" { cfg.TURNSecret = v } + if v := strings.TrimSpace(y.WebRTC.TURNStealthDomain); v != "" { + cfg.StealthCDNDomain = v + } // Validate configuration if errs := cfg.ValidateConfig(); len(errs) > 0 { diff --git a/core/cmd/sni-router/main.go b/core/cmd/sni-router/main.go index 32ae84b..e53e954 100644 --- a/core/cmd/sni-router/main.go +++ b/core/cmd/sni-router/main.go @@ -32,6 +32,18 @@ // backend: // name: gateway // addr: "127.0.0.1:8443" +// turn_discovery: +// namespaces_dir: /opt/orama/.orama/data/namespaces +// base_domain: orama-devnet.network +// rescan_interval: 30s +// +// When the turn_discovery.namespaces_dir is set, the router additionally scans +// <namespaces_dir>/*/configs/turn-*.yaml every rescan_interval and derives two +// routes per namespace with a TURNS listener — the bland stealth host and a +// "turn.ns-<namespace>.<base_domain>" alias — both forwarding to that +// namespace's local TURNS port. Discovered routes are merged with the static +// routes above (static wins on conflict); a transient scan error keeps the +// previously-installed routes. package main import ( @@ -69,14 +81,29 @@ type yamlRoute struct { Backend yamlBackend `yaml:"backend"` } +// yamlTURNDiscovery mirrors sniproxy.TURNDiscoveryConfig for YAML decoding. 
+// When present and namespaces_dir is set, the router auto-discovers per- +// namespace stealth-TURN routes by scanning <namespaces_dir>/*/configs/turn-*.yaml. +type yamlTURNDiscovery struct { + NamespacesDir string `yaml:"namespaces_dir"` + BaseDomain string `yaml:"base_domain"` + RescanInterval time.Duration `yaml:"rescan_interval"` +} + // yamlConfig is the on-disk configuration shape. type yamlConfig struct { - Listen string `yaml:"listen"` - ClientHelloTimeout time.Duration `yaml:"client_hello_timeout"` - BackendDialTimeout time.Duration `yaml:"backend_dial_timeout"` - MaxConcurrentConns int `yaml:"max_concurrent_conns"` - Fallback yamlBackend `yaml:"fallback"` - Routes []yamlRoute `yaml:"routes"` + Listen string `yaml:"listen"` + ClientHelloTimeout time.Duration `yaml:"client_hello_timeout"` + BackendDialTimeout time.Duration `yaml:"backend_dial_timeout"` + MaxConcurrentConns int `yaml:"max_concurrent_conns"` + Fallback yamlBackend `yaml:"fallback"` + Routes []yamlRoute `yaml:"routes"` + TURNDiscovery yamlTURNDiscovery `yaml:"turn_discovery"` +} + +// discoveryEnabled reports whether TURN route auto-discovery is configured. +func (y *yamlConfig) discoveryEnabled() bool { + return y.TURNDiscovery.NamespacesDir != "" } func main() { @@ -94,25 +121,49 @@ func main() { router := sniproxy.NewRouter(toBackend(cfg.Fallback)) - // Hot-reload the route table from the config file so a namespace's - // cdn/turn SNI routes can be added or removed without restarting the - // router (Router.Replace swaps atomically under in-flight connections). 
- reloader := sniproxy.NewFileRouteReloader(configPath, - func() ([]sniproxy.Route, sniproxy.Backend, error) { - y, err := loadConfig(configPath) - if err != nil { - return nil, sniproxy.Backend{}, err - } - return toRoutes(y.Routes), toBackend(y.Fallback), nil - }, router, logger.Logger) - if err := reloader.Apply(); err != nil { - logger.ComponentError(logging.ComponentSNI, "Failed to install initial routes", - zap.Error(err)) - os.Exit(1) + // The static routes (and fallback) always come from the config file; this + // closure is re-evaluated on every reload/rescan so a hand-edit to the + // config is picked up without a restart. + staticSource := func() ([]sniproxy.Route, sniproxy.Backend, error) { + y, err := loadConfig(configPath) + if err != nil { + return nil, sniproxy.Backend{}, err + } + return toRoutes(y.Routes), toBackend(y.Fallback), nil } + routeStop := make(chan struct{}) defer close(routeStop) - go reloader.Watch(sniproxy.DefaultRouteReloadInterval, routeStop) + + if cfg.discoveryEnabled() { + // Auto-discover per-namespace stealth-TURN routes by scanning the + // namespaces directory, merged with the static config routes (static + // wins on conflict), re-installed atomically every rescan_interval. A + // transient scan error keeps the previously-installed routes. + discoverer := sniproxy.NewTURNRouteDiscoverer( + sniproxy.TURNDiscoveryConfig{ + NamespacesDir: cfg.TURNDiscovery.NamespacesDir, + BaseDomain: cfg.TURNDiscovery.BaseDomain, + RescanInterval: cfg.TURNDiscovery.RescanInterval, + }, staticSource, router, logger.Logger) + if err := discoverer.Apply(); err != nil { + logger.ComponentError(logging.ComponentSNI, "Failed to install initial routes", + zap.Error(err)) + os.Exit(1) + } + go discoverer.Run(routeStop) + } else { + // No discovery configured: hot-reload the static route table from the + // config file so cdn/turn SNI routes can be added or removed without + // restarting (Router.Replace swaps atomically under in-flight conns). 
+ reloader := sniproxy.NewFileRouteReloader(configPath, staticSource, router, logger.Logger) + if err := reloader.Apply(); err != nil { + logger.ComponentError(logging.ComponentSNI, "Failed to install initial routes", + zap.Error(err)) + os.Exit(1) + } + go reloader.Watch(sniproxy.DefaultRouteReloadInterval, routeStop) + } srv := sniproxy.NewServer(router, sniproxy.Config{ ClientHelloTimeout: cfg.ClientHelloTimeout, @@ -235,6 +286,16 @@ func validateConfig(y *yamlConfig) []string { errs = append(errs, fmt.Sprintf("routes[%d].backend.addr: required", i)) } } + // turn_discovery is optional, but when partially set (namespaces_dir XOR + // base_domain) it is almost certainly a misconfiguration, so validate the + // pair together via the library's own Validate. + if y.discoveryEnabled() || y.TURNDiscovery.BaseDomain != "" { + dc := sniproxy.TURNDiscoveryConfig{ + NamespacesDir: y.TURNDiscovery.NamespacesDir, + BaseDomain: y.TURNDiscovery.BaseDomain, + } + errs = append(errs, dc.Validate()...) + } return errs } diff --git a/core/docs/SERVERLESS.md b/core/docs/SERVERLESS.md index 195ba8d..78fea9e 100644 --- a/core/docs/SERVERLESS.md +++ b/core/docs/SERVERLESS.md @@ -187,6 +187,69 @@ The legacy `db_execute` is kept indefinitely so existing functions don't break. |----------|-------------| | `pubsub_publish(topic, dataJSON)` → bool | Publish message to a PubSub topic. Returns true on success. | +### Ephemeral State (WS-subscribe-tracked) + +Short-lived per-subscriber state (typing indicators, presence, call ringing, +live cursors) that the gateway **auto-clears the moment the owning WebSocket +client disconnects** — no heartbeats, no prune crons. State also expires on a +TTL backstop (default 60 s, max 30 min). The owning client ID and namespace +come from the server-trusted invocation context; functions cannot spoof them. 
+ +| Function | Description | +|----------|-------------| +| `ephemeral_state_set(topic, key, payload, ttlMs)` → u32 | Record state owned by the CURRENT invocation's WS client and publish an `ephemeral.set` event on the topic. 1 = ok, 0 = failure (no WS client, empty topic/key, payload > 16 KiB, > 256 keys/client). | +| `ephemeral_state_clear(topic, key)` → u32 | Clear state this client owns; publishes `ephemeral.clear` (reason `explicit`). Idempotent — clearing a missing/non-owned key returns 1. | +| `ephemeral_state_list(topic)` → u64 | Reconnect catch-up read: packed `ptr<<32\|len` of a JSON envelope with the live entries on the topic. Works without a WS client (read-only). 0 on failure. | + +Raw import signatures (pointer/length ABI — note `ttlMs` is **i64**): + +```go +//go:wasmimport env ephemeral_state_set +func ephemeralStateSet(topicPtr *byte, topicLen uint32, keyPtr *byte, keyLen uint32, + payloadPtr *byte, payloadLen uint32, ttlMs int64) uint32 + +//go:wasmimport env ephemeral_state_clear +func ephemeralStateClear(topicPtr *byte, topicLen uint32, keyPtr *byte, keyLen uint32) uint32 + +//go:wasmimport env ephemeral_state_list +func ephemeralStateList(topicPtr *byte, topicLen uint32) uint64 // ptr<<32|len of JSON +``` + +Synthetic events are published **on the same topic** the state lives on, with +the `_orama` control-frame discriminator (same dispatch pattern as the +`auth.refresh` frame). Subscribers update their local view from the stream: + +```json +{"_orama":"ephemeral.set", "topic":"typing:room1", "key":"user-7", "client_id":"ws-abc", "payload":""} +{"_orama":"ephemeral.clear","topic":"typing:room1", "key":"user-7", "client_id":"ws-abc", "reason":"disconnect"} +``` + +`reason` is `explicit` (function called clear), `disconnect` (owning WS client +went away — the zero-lag path), or `expired` (TTL backstop). `payload` is +base64 (Go `[]byte` JSON encoding) and present only on `ephemeral.set`. 
+ +`ephemeral_state_list` returns: + +```json +{"entries":[{"key":"user-7","client_id":"ws-abc","payload":"","expires_in_ms":48211}]} +``` + +Typing-indicator shape (called from a `ws_persistent` rpc-router function): + +```go +// Client sends {"op":"typing.start","room":"room1","user":"user-7"} → handler: +ephemeralStateSet(ptr("typing:"+room), len32("typing:"+room), + ptr(userID), len32(userID), nil, 0, 30_000) // 30s TTL backstop + +// Client sends typing.stop → handler: +ephemeralStateClear(ptr("typing:"+room), len32("typing:"+room), ptr(userID), len32(userID)) + +// No typing.stop needed on app kill / network drop: the WS disconnect publishes +// {"_orama":"ephemeral.clear",...,"reason":"disconnect"} to every subscriber +// immediately. On (re)connect, call ephemeral_state_list("typing:"+room) once +// to seed local state, then track the event stream. +``` + ### Logging | Function | Description | diff --git a/core/migrations/030_webrtc_stealth.sql b/core/migrations/030_webrtc_stealth.sql new file mode 100644 index 0000000..2b4c94e --- /dev/null +++ b/core/migrations/030_webrtc_stealth.sql @@ -0,0 +1,16 @@ +-- ============================================================================= +-- 030_webrtc_stealth.sql +-- +-- Stealth TURNS-over-443 per namespace — feat-124 (censorship-resistant +-- calling). When stealth_enabled is true the namespace's TURN servers carry a +-- second TLS certificate for the neutral stealth hostname +-- (cdn-<id>.<base_domain>, derived via turn.StealthHostForNamespace), the +-- SNI router forwards :443 ClientHellos for that hostname to the TURN TLS +-- listener, and turn.credentials advertises `turns:<stealth-host>:443` as the +-- final rung of the ICE URI ladder. +-- +-- Default false → backward compatible: existing WebRTC namespaces keep the +-- baseline udp:3478 / tcp:3478 / turns:5349 URIs unchanged. 
+-- ============================================================================= + +ALTER TABLE namespace_webrtc_config ADD COLUMN stealth_enabled BOOLEAN DEFAULT FALSE; diff --git a/core/pkg/cli/namespace_commands.go b/core/pkg/cli/namespace_commands.go index 6150406..6024f46 100644 --- a/core/pkg/cli/namespace_commands.go +++ b/core/pkg/cli/namespace_commands.go @@ -79,6 +79,8 @@ func showNamespaceHelp() { fmt.Printf(" repair - Repair an under-provisioned namespace cluster\n") fmt.Printf(" enable webrtc --namespace NS - Enable WebRTC (SFU + TURN) for a namespace\n") fmt.Printf(" disable webrtc --namespace NS - Disable WebRTC for a namespace\n") + fmt.Printf(" enable webrtc-stealth --namespace NS - Enable stealth TURNS over :443 (feat-124)\n") + fmt.Printf(" disable webrtc-stealth --namespace NS - Disable stealth TURNS\n") fmt.Printf(" webrtc-status --namespace NS - Show WebRTC service status\n") fmt.Printf(" help - Show this help message\n\n") fmt.Printf("Flags:\n") @@ -226,8 +228,12 @@ func handleNamespaceDelete(force bool) { func handleNamespaceEnable(args []string) { feature := args[0] + if feature == "webrtc-stealth" { + handleNamespaceStealthToggle(args[1:], true) + return + } if feature != "webrtc" { - fmt.Fprintf(os.Stderr, "Unknown feature: %s\nSupported features: webrtc\n", feature) + fmt.Fprintf(os.Stderr, "Unknown feature: %s\nSupported features: webrtc, webrtc-stealth\n", feature) os.Exit(1) } @@ -283,10 +289,82 @@ func handleNamespaceEnable(args []string) { fmt.Printf(" TURN instances: 2 nodes (relay on public IPs)\n") } +// handleNamespaceStealthToggle drives /v1/namespace/webrtc/stealth/{enable|disable} +// (feat-124 — censorship-resistant TURNS over :443). 
+func handleNamespaceStealthToggle(args []string, enable bool) { + verb := "disable" + if enable { + verb = "enable" + } + + var ns string + fs := flag.NewFlagSet("namespace "+verb+" webrtc-stealth", flag.ExitOnError) + fs.StringVar(&ns, "namespace", "", "Namespace name") + _ = fs.Parse(args) + + if ns == "" { + fmt.Fprintf(os.Stderr, "Usage: orama namespace %s webrtc-stealth --namespace <name>\n", verb) + os.Exit(1) + } + + gatewayURL, apiKey := loadAuthForNamespace(ns) + + if enable { + fmt.Printf("Enabling WebRTC stealth (TURNS over :443) for namespace '%s'...\n", ns) + fmt.Printf("This provisions a Let's Encrypt cert for the neutral stealth host and may take up to ~2 minutes.\n") + } else { + fmt.Printf("Disabling WebRTC stealth for namespace '%s'...\n", ns) + } + + url := fmt.Sprintf("%s/v1/namespace/webrtc/stealth/%s", gatewayURL, verb) + req, err := http.NewRequest(http.MethodPost, url, nil) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to create request: %v\n", err) + os.Exit(1) + } + req.Header.Set("Authorization", "Bearer "+apiKey) + + client := &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }, + } + resp, err := client.Do(req) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to connect to gateway: %v\n", err) + os.Exit(1) + } + defer resp.Body.Close() + + var result map[string]interface{} + _ = json.NewDecoder(resp.Body).Decode(&result) // best-effort: a non-JSON body leaves result nil and the status check below reports "unknown error" + + if resp.StatusCode != http.StatusOK { + errMsg := "unknown error" + if e, ok := result["error"].(string); ok { + errMsg = e + } + fmt.Fprintf(os.Stderr, "Failed to %s WebRTC stealth: %s\n", verb, errMsg) + os.Exit(1) + } + + if enable { + fmt.Printf("WebRTC stealth enabled for namespace '%s'.\n", ns) + fmt.Printf(" turn.credentials now advertises the full URI ladder including turns:<stealth-host>:443.\n") + fmt.Printf(" Make sure the SNI router is enabled on the TURN nodes (node.yaml sni_router.enabled).\n") + } else { + fmt.Printf("WebRTC stealth disabled for namespace '%s'.\n", ns) + } +} + func handleNamespaceDisable(args []string) { feature := args[0] + if feature == "webrtc-stealth" { + handleNamespaceStealthToggle(args[1:], false) + return + } if feature != "webrtc" { - fmt.Fprintf(os.Stderr, "Unknown feature: %s\nSupported features: webrtc\n", feature) + fmt.Fprintf(os.Stderr, "Unknown feature: %s\nSupported features: webrtc, webrtc-stealth\n", feature) os.Exit(1) } diff --git a/core/pkg/environments/production/config.go b/core/pkg/environments/production/config.go index 9100f94..ae27b2a 100644 --- a/core/pkg/environments/production/config.go +++ b/core/pkg/environments/production/config.go @@ -230,9 +230,54 @@ func (cg *ConfigGenerator) GenerateNodeConfig(peerAddresses []string, vpsIP stri return "", fmt.Errorf("failed to populate webrtc config: %w", err) } + // Stealth TURN SNI router (feat-124). Like the webrtc block, sni_router is + // an operator opt-in that only exists in the previous node.yaml, so carry + // it forward across regeneration. Without this, a Phase4 regen would reset + // sni_router.enabled to false, stop the :443 router and break stealth TURN + // for every region that relies on it (the same regen-wipe class of outage + // as bugboard #259/#846). + cg.populateSNIRouterConfig(&data) + return templates.RenderNodeConfig(data) } +// populateSNIRouterConfig carries forward the operator-set sni_router.enabled +// flag from the existing node.yaml so a config regeneration never silently +// disables the stealth TURN-over-443 router. Absence of the file or block +// leaves the flag at its default (false). +func (cg *ConfigGenerator) populateSNIRouterConfig(data *templates.NodeConfigData) { + data.SNIRouterEnabled = cg.readExistingSNIRouterEnabled() +} + +// SNIRouterEnabled reports whether the node's on-disk node.yaml has opted in to +// the stealth TURN-over-443 SNI router. The orchestrator reads this AFTER +// Phase4 has written node.yaml to decide whether to move Caddy to :8443 and +// start the router unit. 
Returns false when the config or block is absent. +func (cg *ConfigGenerator) SNIRouterEnabled() bool { + return cg.readExistingSNIRouterEnabled() +} + +// readExistingSNIRouterEnabled parses just the top-level sni_router.enabled +// flag out of the existing node.yaml. Returns false when the file is missing, +// malformed, or has no sni_router block (fresh install / not opted in). +func (cg *ConfigGenerator) readExistingSNIRouterEnabled() bool { + configPath := filepath.Join(cg.oramaDir, "configs", "node.yaml") + raw, err := os.ReadFile(configPath) + if err != nil { + return false // No existing config (fresh install) — default off. + } + + var parsed struct { + SNIRouter struct { + Enabled bool `yaml:"enabled"` + } `yaml:"sni_router"` + } + if err := yaml.Unmarshal(raw, &parsed); err != nil { + return false // Malformed/old config — don't fail regen; default off. + } + return parsed.SNIRouter.Enabled +} + // existingWebRTC is the minimal shape parsed out of an existing node.yaml to // carry forward operator-set WebRTC fields across a config regeneration. 
type existingWebRTC struct { diff --git a/core/pkg/environments/production/installers.go b/core/pkg/environments/production/installers.go index c96134f..8c31e35 100644 --- a/core/pkg/environments/production/installers.go +++ b/core/pkg/environments/production/installers.go @@ -23,7 +23,8 @@ type BinaryInstaller struct { gateway *installers.GatewayInstaller coredns *installers.CoreDNSInstaller caddy *installers.CaddyInstaller - ntfy *installers.NtfyInstaller // feature #72; installed only when EnableNtfy is set + ntfy *installers.NtfyInstaller // feature #72; installed only when EnableNtfy is set + sniRouter *installers.SNIRouterInstaller // feat-124; configured only when sni_router.enabled } // NewBinaryInstaller creates a new binary installer @@ -41,6 +42,7 @@ func NewBinaryInstaller(arch string, logWriter io.Writer) *BinaryInstaller { coredns: installers.NewCoreDNSInstaller(arch, logWriter, oramaHome), caddy: installers.NewCaddyInstaller(arch, logWriter, oramaHome), ntfy: installers.NewNtfyInstaller(arch, logWriter), + sniRouter: installers.NewSNIRouterInstaller(arch, logWriter, OramaDir), } } @@ -158,6 +160,29 @@ func (bi *BinaryInstaller) EnableCaddyNtfyProxy(hostname string) { bi.caddy.EnableNtfyProxy(hostname) } +// EnableCaddySNIRouterMode moves Caddy's HTTPS listener off :443 to :8443 on +// the next ConfigureCaddy() call, freeing :443 for the orama-sni-router +// (feat-124). Must be called BEFORE ConfigureCaddy. +func (bi *BinaryInstaller) EnableCaddySNIRouterMode() { + bi.caddy.EnableSNIRouterMode() +} + +// ConfigureSNIRouter writes the orama-sni-router YAML config (listen :443, +// fallback Caddy on :8443, turn_discovery for baseDomain). Feat-124. +func (bi *BinaryInstaller) ConfigureSNIRouter(baseDomain string) error { + return bi.sniRouter.Configure(baseDomain) +} + +// WriteSNIRouterUnit writes /etc/systemd/system/orama-sni-router.service. 
+func (bi *BinaryInstaller) WriteSNIRouterUnit() error { + return bi.sniRouter.WriteSystemdUnit() +} + +// SNIRouterServiceName returns the systemd unit name for lifecycle calls. +func (bi *BinaryInstaller) SNIRouterServiceName() string { + return installers.SNIRouterServiceName +} + // InstallNtfy installs the self-hosted ntfy server (binary, user, // systemd unit, data directory). Feature #72. Idempotent. func (bi *BinaryInstaller) InstallNtfy() error { diff --git a/core/pkg/environments/production/installers/caddy.go b/core/pkg/environments/production/installers/caddy.go index f339758..9ce4a50 100644 --- a/core/pkg/environments/production/installers/caddy.go +++ b/core/pkg/environments/production/installers/caddy.go @@ -27,8 +27,20 @@ type CaddyInstaller struct { // Enabled per-node via EnableNtfyProxy. Feature #72. withNtfy bool ntfyHostname string // e.g. "push.dbrs.space" — fully-qualified public host + + // behindSNIRouter, when set, moves Caddy's HTTPS listener off :443 to + // CaddyHTTPSPortBehindSNI so the orama-sni-router can own :443 and forward + // TLS by SNI (feat-124, stealth TURN). Enabled per-node via + // EnableSNIRouterMode. Plain HTTP (:80) is unaffected. When false the + // generated Caddyfile is byte-identical to the pre-feature output. + behindSNIRouter bool } +// CaddyHTTPSPortBehindSNI is the port Caddy binds for HTTPS when the node runs +// behind the SNI router (which owns :443). 8443 matches the sni-router config's +// caddy fallback backend (127.0.0.1:8443) and the plan doc. 
+const CaddyHTTPSPortBehindSNI = 8443 + // NewCaddyInstaller creates a new Caddy installer func NewCaddyInstaller(arch string, logWriter io.Writer, oramaHome string) *CaddyInstaller { return &CaddyInstaller{ @@ -52,6 +64,16 @@ func (ci *CaddyInstaller) EnableNtfyProxy(hostname string) { ci.ntfyHostname = hostname } +// EnableSNIRouterMode tells the Caddy installer to bind HTTPS on +// CaddyHTTPSPortBehindSNI (8443) instead of :443, freeing :443 for the +// orama-sni-router (feat-124). Plain HTTP on :80 is left untouched. Must be +// called BEFORE Configure so the generated Caddyfile picks up the global +// `https_port` option. A no-op when never called: the default Caddyfile keeps +// HTTPS on :443. +func (ci *CaddyInstaller) EnableSNIRouterMode() { + ci.behindSNIRouter = true +} + // IsInstalled checks if Caddy with orama DNS module is already installed func (ci *CaddyInstaller) IsInstalled() bool { caddyPath := "/usr/bin/caddy" @@ -417,7 +439,17 @@ func (ci *CaddyInstaller) generateCaddyfile(domain, email, acmeEndpoint, baseDom // workload is REST + WebSocket (neither benefits much from // h2 stream multiplexing — REST is keep-alive over h1, and // WS is single-connection by design). - sb.WriteString(fmt.Sprintf("{\n email %s\n servers {\n protocols h1\n }\n}\n", email)) + // When this node runs behind the SNI router (feat-124), move Caddy's HTTPS + // listener off :443 to CaddyHTTPSPortBehindSNI via the `https_port` global + // option. The sni-router owns :443 and forwards TLS by SNI to either a + // namespace's TURNS listener or here (127.0.0.1:8443). Plain HTTP (:80) is + // unchanged. When behindSNIRouter is false, no `https_port` line is emitted + // and the Caddyfile is byte-identical to the pre-feature output. 
+ httpsPortOption := "" + if ci.behindSNIRouter { + httpsPortOption = fmt.Sprintf(" https_port %d\n", CaddyHTTPSPortBehindSNI) + } + sb.WriteString(fmt.Sprintf("{\n email %s\n%s servers {\n protocols h1\n }\n}\n", email, httpsPortOption)) // Node domain blocks (e.g., node1.dbrs.space, *.node1.dbrs.space) sb.WriteString(fmt.Sprintf("\n*.%s {\n%s\n reverse_proxy localhost:6001\n}\n", domain, tlsBlock)) diff --git a/core/pkg/environments/production/installers/caddy_test.go b/core/pkg/environments/production/installers/caddy_test.go index 31598c5..b0b21c0 100644 --- a/core/pkg/environments/production/installers/caddy_test.go +++ b/core/pkg/environments/production/installers/caddy_test.go @@ -1,6 +1,7 @@ package installers import ( + "fmt" "io" "strings" "testing" @@ -97,3 +98,50 @@ func TestGenerateCaddyfile_BaseDomainSameAsDomainOmitsDuplicates(t *testing.T) { t.Errorf("expected exactly 2 `*.dbrs.space {` occurrences (1 TLS + 1 HTTP), got %d in:\n%s", got, cf) } } + +// TestGenerateCaddyfile_SNIRouterDisabledByteIdentical is the safety guard for +// feat-124: when EnableSNIRouterMode has NOT been called, the generated +// Caddyfile must be byte-identical to the pre-feature output (HTTPS stays on +// :443, no `https_port` global option). This is the default for every existing +// node — any drift here is a silent production change. +func TestGenerateCaddyfile_SNIRouterDisabledByteIdentical(t *testing.T) { + ci := newTestCaddyInstaller() + cf := ci.generateCaddyfile("node1.dbrs.space", "admin@dbrs.space", + "http://localhost:6001/v1/internal/acme", "dbrs.space") + + if strings.Contains(cf, "https_port") { + t.Errorf("default Caddyfile must NOT contain `https_port` (SNI router off); got:\n%s", cf) + } + if strings.Contains(cf, "8443") { + t.Errorf("default Caddyfile must NOT reference :8443 (SNI router off); got:\n%s", cf) + } + // The global options block must be exactly the pre-feature shape. 
+ if !strings.Contains(cf, "{\n email admin@dbrs.space\n servers {\n protocols h1\n }\n}\n") { + t.Errorf("default global options block drifted from pre-feature output; got:\n%s", cf) + } +} + +// TestGenerateCaddyfile_SNIRouterEnabledMovesHTTPSTo8443 verifies that after +// EnableSNIRouterMode, Caddy's HTTPS listener is moved to :8443 via the +// `https_port` global option, while plain HTTP (:80) is unchanged so ACME +// HTTP-01 and the HTTP catch-all still work. +func TestGenerateCaddyfile_SNIRouterEnabledMovesHTTPSTo8443(t *testing.T) { + ci := newTestCaddyInstaller() + ci.EnableSNIRouterMode() + cf := ci.generateCaddyfile("node1.dbrs.space", "admin@dbrs.space", + "http://localhost:6001/v1/internal/acme", "dbrs.space") + + want := fmt.Sprintf("https_port %d", CaddyHTTPSPortBehindSNI) + if !strings.Contains(cf, want) { + t.Errorf("SNI-router Caddyfile must contain %q; got:\n%s", want, cf) + } + // The global option belongs inside the top-level options block, before the + // servers stanza. + if !strings.Contains(cf, "{\n email admin@dbrs.space\n https_port 8443\n servers {\n protocols h1\n }\n}\n") { + t.Errorf("https_port not placed correctly in global options block; got:\n%s", cf) + } + // Plain HTTP :80 catch-all must be unchanged. + if !strings.Contains(cf, ":80 {") { + t.Errorf("HTTP :80 block must remain when SNI router enabled; got:\n%s", cf) + } +} diff --git a/core/pkg/environments/production/installers/sni_router.go b/core/pkg/environments/production/installers/sni_router.go new file mode 100644 index 0000000..5a2706e --- /dev/null +++ b/core/pkg/environments/production/installers/sni_router.go @@ -0,0 +1,203 @@ +package installers + +import ( + "fmt" + "io" + "os" + "path/filepath" +) + +// SNI router installer (feat-124, stealth TURN-over-443). 
+// +// Unlike the binary installers (Caddy, ntfy), the orama-sni-router binary is +// built and shipped to the node by `orama build` / the install tarball — this +// installer only writes the router's YAML config and the systemd unit, and +// drives the unit's lifecycle (install+enable+start when enabled, +// stop+disable when not). + +const ( + // SNIRouterListenAddr is the public port the router binds. It owns :443 so + // Caddy is moved to CaddyHTTPSPortBehindSNI (see caddy.go). + SNIRouterListenAddr = ":443" + + // SNIRouterServiceName is the systemd unit name. + SNIRouterServiceName = "orama-sni-router.service" + + // SNIRouterConfigName is the router config filename (resolved under + // <oramaDir>/configs by the binary's config.DefaultPath lookup). + SNIRouterConfigName = "sni-router.yaml" + + // sniRouterRescanInterval is how often the router rescans the namespaces + // directory for per-namespace TURNS listeners. Matches the library default + // (sniproxy.DefaultDiscoveryRescanInterval); kept as a literal here to avoid + // importing the runtime package into the installer. + sniRouterRescanInterval = "30s" + + // sniRouterClientHelloTimeout / sniRouterBackendDialTimeout bound the + // per-connection ClientHello peek and backend dial (slowloris / dead-backend + // protection). Mirror the sniproxy server defaults. + sniRouterClientHelloTimeout = "5s" + sniRouterBackendDialTimeout = "5s" + + // sniRouterMaxConcurrentConns caps in-flight connections on the public + // :443 listener (DoS guard); mirrors the sniproxy server default. + sniRouterMaxConcurrentConns = 10000 + + // sniRouterSystemdUnitPath is where the unit file is written. + sniRouterSystemdUnitPath = "/etc/systemd/system/" + SNIRouterServiceName + + // sniRouterBinaryPath is the installed binary path on the node. + sniRouterBinaryPath = "/opt/orama/bin/orama-sni-router" +) + +// SNIRouterInstaller writes the orama-sni-router config + systemd unit and +// manages the unit lifecycle. 
The caddy fallback port matches +// CaddyHTTPSPortBehindSNI so unmatched SNIs (regular HTTPS) reach the moved +// Caddy listener. +type SNIRouterInstaller struct { + *BaseInstaller + oramaDir string // e.g. "/opt/orama/.orama" +} + +// NewSNIRouterInstaller creates an installer. oramaDir is the node's .orama +// data root (where configs/ and data/namespaces live). +func NewSNIRouterInstaller(arch string, logWriter io.Writer, oramaDir string) *SNIRouterInstaller { + return &SNIRouterInstaller{ + BaseInstaller: NewBaseInstaller(arch, logWriter), + oramaDir: oramaDir, + } +} + +// configPath returns the absolute path the router config is written to and the +// binary resolves to via its DefaultPath lookup (<oramaDir>/configs/<SNIRouterConfigName>). +func (si *SNIRouterInstaller) configPath() string { + return filepath.Join(si.oramaDir, "configs", SNIRouterConfigName) +} + +// namespacesDir returns the per-namespace config root the router scans for +// TURNS listeners. +func (si *SNIRouterInstaller) namespacesDir() string { + return filepath.Join(si.oramaDir, "data", "namespaces") +} + +// Configure writes the router YAML config. baseDomain drives the stealth and +// "turn.ns-*" SNI hostnames the router derives during discovery. Idempotent. +func (si *SNIRouterInstaller) Configure(baseDomain string) error { + if baseDomain == "" { + return fmt.Errorf("sni-router: base domain must not be empty") + } + + configDir := filepath.Dir(si.configPath()) + if err := os.MkdirAll(configDir, 0755); err != nil { + return fmt.Errorf("sni-router: create config dir %s: %w", configDir, err) + } + + content := si.generateConfig(baseDomain) + if err := os.WriteFile(si.configPath(), []byte(content), 0644); err != nil { + return fmt.Errorf("sni-router: write config %s: %w", si.configPath(), err) + } + return nil +} + +// generateConfig renders the sni-router.yaml. 
The fallback is Caddy on +// CaddyHTTPSPortBehindSNI; turn_discovery scans the node's namespaces dir so +// per-namespace TURNS routes appear without a router restart. No static routes +// are emitted — every TURNS route is auto-discovered. +func (si *SNIRouterInstaller) generateConfig(baseDomain string) string { + return fmt.Sprintf(`# Orama SNI router config (feat-124, stealth TURN-over-443). +# Generated by the installer — re-running install/upgrade overwrites this file. +# +# The router owns :443, peeks each connection's TLS ClientHello SNI, and +# forwards the raw (still-encrypted) stream to a backend. TLS is NOT terminated +# here. Unmatched SNIs (regular HTTPS) go to the fallback (Caddy on :%[2]d). +listen: "%[1]s" +client_hello_timeout: %[3]s +backend_dial_timeout: %[4]s +max_concurrent_conns: %[5]d + +fallback: + name: caddy + addr: "127.0.0.1:%[2]d" + +# Per-namespace stealth-TURN routes are auto-discovered by scanning +# <namespaces_dir>/*/configs/turn-*.yaml every rescan_interval. Each namespace +# with a TURNS listener gets two routes (the bland stealth host and a +# turn.ns-<namespace>.<base_domain> alias) forwarding to its local TURNS port. +turn_discovery: + namespaces_dir: %[6]q + base_domain: %[7]q + rescan_interval: %[8]s + +# No static routes: every TURNS route comes from turn_discovery above. +routes: [] +`, + SNIRouterListenAddr, + CaddyHTTPSPortBehindSNI, + sniRouterClientHelloTimeout, + sniRouterBackendDialTimeout, + sniRouterMaxConcurrentConns, + si.namespacesDir(), + baseDomain, + sniRouterRescanInterval, + ) +} + +// generateSystemdUnit renders /etc/systemd/system/orama-sni-router.service. +// Runs as the orama user with CAP_NET_BIND_SERVICE so it can bind :443 without +// root. Ordered Before=caddy.service so the router is ready before Caddy +// switches to :8443. Restart=on-failure. 
+func (si *SNIRouterInstaller) generateSystemdUnit() string { + return fmt.Sprintf(`[Unit] +Description=Orama SNI Router (TLS-level :443 → backend forwarder) +Documentation=https://github.com/DeBrosOfficial/network +After=network.target +Before=caddy.service +PartOf=orama-node.service + +[Service] +Type=simple +WorkingDirectory=/opt/orama +EnvironmentFile=-/opt/orama/.orama/data/sni-router.env +ExecStart=%s --config %s + +# Bind privileged ports (:80, :443) without running as root. +AmbientCapabilities=CAP_NET_BIND_SERVICE +CapabilityBoundingSet=CAP_NET_BIND_SERVICE + +User=orama +Group=orama +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=yes +PrivateTmp=yes +LimitNOFILE=65536 + +TimeoutStopSec=15s +KillMode=mixed +KillSignal=SIGTERM + +Restart=on-failure +RestartSec=5s + +StandardOutput=journal +StandardError=journal +SyslogIdentifier=orama-sni-router + +[Install] +WantedBy=multi-user.target +`, sniRouterBinaryPath, si.configPath()) +} + +// WriteSystemdUnit writes the unit file. Idempotent. +func (si *SNIRouterInstaller) WriteSystemdUnit() error { + if err := os.WriteFile(sniRouterSystemdUnitPath, []byte(si.generateSystemdUnit()), 0644); err != nil { + return fmt.Errorf("sni-router: write systemd unit %s: %w", sniRouterSystemdUnitPath, err) + } + return nil +} + +// IsInstalled reports whether the router binary is present on the node. +func (si *SNIRouterInstaller) IsInstalled() bool { + _, err := os.Stat(sniRouterBinaryPath) + return err == nil +} diff --git a/core/pkg/environments/production/installers/sni_router_test.go b/core/pkg/environments/production/installers/sni_router_test.go new file mode 100644 index 0000000..dbcf4e6 --- /dev/null +++ b/core/pkg/environments/production/installers/sni_router_test.go @@ -0,0 +1,102 @@ +package installers + +import ( + "io" + "os" + "path/filepath" + "strings" + "testing" +) + +// newTestSNIRouterInstaller returns an installer rooted at a temp oramaDir so +// Configure writes to an isolated location. 
+func newTestSNIRouterInstaller(oramaDir string) *SNIRouterInstaller { + return NewSNIRouterInstaller("amd64", io.Discard, oramaDir) +} + +// TestGenerateConfig_includesDiscoveryAndFallback verifies the rendered +// sni-router.yaml binds :443, falls back to Caddy on the moved HTTPS port, and +// emits a turn_discovery block pointing at the node's namespaces dir + base +// domain. +func TestGenerateConfig_includesDiscoveryAndFallback(t *testing.T) { + dir := t.TempDir() + si := newTestSNIRouterInstaller(dir) + + cfg := si.generateConfig("orama-devnet.network") + + for _, want := range []string{ + `listen: ":443"`, + "fallback:", + `addr: "127.0.0.1:8443"`, + "turn_discovery:", + "base_domain: \"orama-devnet.network\"", + "rescan_interval: 30s", + "routes: []", + } { + if !strings.Contains(cfg, want) { + t.Errorf("generated sni-router config missing %q\n---\n%s", want, cfg) + } + } + + // namespaces_dir must be the node's data/namespaces path. + wantNS := filepath.Join(dir, "data", "namespaces") + if !strings.Contains(cfg, wantNS) { + t.Errorf("config missing namespaces_dir %q\n---\n%s", wantNS, cfg) + } +} + +// TestConfigure_writesFileToConfigsDir verifies Configure persists the YAML to +// /configs/sni-router.yaml. +func TestConfigure_writesFileToConfigsDir(t *testing.T) { + dir := t.TempDir() + si := newTestSNIRouterInstaller(dir) + + if err := si.Configure("example.com"); err != nil { + t.Fatalf("Configure failed: %v", err) + } + + path := filepath.Join(dir, "configs", "sni-router.yaml") + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("expected config at %s: %v", path, err) + } + if !strings.Contains(string(data), "base_domain: \"example.com\"") { + t.Errorf("written config missing base_domain; got:\n%s", string(data)) + } +} + +// TestConfigure_rejectsEmptyBaseDomain verifies the installer refuses an empty +// base domain rather than emitting a config that would derive bogus hostnames. 
+func TestConfigure_rejectsEmptyBaseDomain(t *testing.T) { + si := newTestSNIRouterInstaller(t.TempDir()) + if err := si.Configure(""); err == nil { + t.Errorf("expected error for empty base domain") + } +} + +// TestGenerateSystemdUnit_shape verifies the unit grants CAP_NET_BIND_SERVICE, +// runs as orama, restarts on failure, and points ExecStart at the installed +// binary + config. +func TestGenerateSystemdUnit_shape(t *testing.T) { + dir := t.TempDir() + si := newTestSNIRouterInstaller(dir) + unit := si.generateSystemdUnit() + + for _, want := range []string{ + "AmbientCapabilities=CAP_NET_BIND_SERVICE", + "User=orama", + "Restart=on-failure", + "EnvironmentFile=-/opt/orama/.orama/data/sni-router.env", + // ExecStart must point at the ABSOLUTE config path so it doesn't + // depend on WorkingDirectory/$HOME resolution at runtime. + "ExecStart=/opt/orama/bin/orama-sni-router --config " + si.configPath(), + "Before=caddy.service", + } { + if !strings.Contains(unit, want) { + t.Errorf("systemd unit missing %q\n---\n%s", want, unit) + } + } + if !strings.Contains(si.configPath(), dir) { + t.Errorf("configPath %q not rooted at the oramaDir %q", si.configPath(), dir) + } +} diff --git a/core/pkg/environments/production/orchestrator.go b/core/pkg/environments/production/orchestrator.go index b910f30..676d65c 100644 --- a/core/pkg/environments/production/orchestrator.go +++ b/core/pkg/environments/production/orchestrator.go @@ -741,11 +741,35 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s ps.logf(" ✓ ntfy config generated (base_url: %s)", ntfyBaseURL) } + // Stealth TURN-over-443 (feat-124): when the node opted in + // (sni_router.enabled in the node.yaml just written above), Caddy + // must vacate :443 so the orama-sni-router can own it. Move Caddy's + // HTTPS listener to :8443 BEFORE ConfigureCaddy renders the Caddyfile. + // When not opted in, the Caddyfile is byte-identical to before. 
+ if ps.configGenerator.SNIRouterEnabled() { + ps.binaryInstaller.EnableCaddySNIRouterMode() + ps.logf(" ✓ SNI router enabled — Caddy HTTPS will bind :8443") + } + if err := ps.binaryInstaller.ConfigureCaddy(caddyDomain, email, acmeEndpoint, baseDomain); err != nil { ps.logf(" ⚠️ Caddy config warning: %v", err) } else { ps.logf(" ✓ Caddy config generated") } + + // Stealth TURN-over-443 (feat-124): when opted in, write the + // orama-sni-router config (listen :443, fallback Caddy :8443, + // turn_discovery scanning this node's namespaces dir for the cluster's + // base domain). The unit lifecycle is driven in Phase5 after Caddy has + // moved to :8443. The router uses the base domain as the zone for + // stealth/turn.ns-* hostnames. + if ps.configGenerator.SNIRouterEnabled() { + if err := ps.binaryInstaller.ConfigureSNIRouter(dnsZone); err != nil { + ps.logf(" ⚠️ SNI router config warning: %v", err) + } else { + ps.logf(" ✓ SNI router config generated (zone: %s)", dnsZone) + } + } } return nil @@ -871,6 +895,14 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { } } + // SNI router unit (feat-124). Write the unit whenever the binary is present + // so the daemon-reload below picks it up; the enable/start vs stop/disable + // decision (based on sni_router.enabled) happens after Caddy has moved to + // :8443, in the start section. + if ps.binaryInstaller.WriteSNIRouterUnit() == nil { + ps.logf(" ✓ SNI router service unit created: %s", ps.binaryInstaller.SNIRouterServiceName()) + } + // Reload systemd daemon if err := ps.serviceController.DaemonReload(); err != nil { return fmt.Errorf("failed to reload systemd: %w", err) @@ -980,6 +1012,31 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { } } + // Stealth TURN-over-443 (feat-124) cutover. Caddy has just been + // reconfigured to :8443 and restarted above, so :443 is now free for the + // SNI router. 
When opted in, enable+start the router; when not, stop+disable + // it so a node that flipped the flag off cleanly returns :443 to Caddy. + sniSvc := ps.binaryInstaller.SNIRouterServiceName() + if ps.configGenerator.SNIRouterEnabled() { + if err := ps.serviceController.EnableService(sniSvc); err != nil { + ps.logf(" ⚠️ Failed to enable %s: %v", sniSvc, err) + } + if err := ps.serviceController.RestartService(sniSvc); err != nil { + ps.logf(" ⚠️ Failed to start %s: %v", sniSvc, err) + } else { + ps.logf(" - %s started (owns :443)", sniSvc) + } + } else { + // Not opted in: ensure the router is not holding :443. Errors are + // non-fatal — the unit may simply not be loaded on this node. + if err := ps.serviceController.StopService(sniSvc); err != nil { + ps.logf(" ℹ️ %s not running (expected when disabled): %v", sniSvc, err) + } + if err := ps.serviceController.DisableService(sniSvc); err != nil { + ps.logf(" ℹ️ %s not enabled (expected when disabled): %v", sniSvc, err) + } + } + // Start ntfy on every node (#72). Caddy must already be up (it // terminates TLS for push.), which the order above // guarantees. diff --git a/core/pkg/environments/production/sni_router_test.go b/core/pkg/environments/production/sni_router_test.go new file mode 100644 index 0000000..2c2d730 --- /dev/null +++ b/core/pkg/environments/production/sni_router_test.go @@ -0,0 +1,72 @@ +package production + +import ( + "strings" + "testing" +) + +// TestGenerateNodeConfig_preservesSNIRouterEnabled is the regression test for +// the feat-124 regen-wipe class of outage (cf. bugboard #259/#846 for webrtc): +// a config regeneration must NOT silently reset an operator's +// sni_router.enabled: true back to false, which would stop the :443 router and +// break stealth TURN. We write a node.yaml with the flag set, regenerate, and +// assert it survives. 
+func TestGenerateNodeConfig_preservesSNIRouterEnabled(t *testing.T) { + dir := t.TempDir() + writeNodeYAML(t, dir, `sni_router: + enabled: true + +http_gateway: + enabled: true +`) + + cg := NewConfigGenerator(dir) + out, err := cg.GenerateNodeConfig(nil, "10.0.0.5", "", "node-1.dbrs.space", "dbrs.space", false) + if err != nil { + t.Fatalf("GenerateNodeConfig failed: %v", err) + } + + if !strings.Contains(out, "sni_router:") { + t.Fatalf("regenerated node.yaml missing sni_router block\n---\n%s", out) + } + if !strings.Contains(out, "enabled: true") { + t.Errorf("regenerated node.yaml did not preserve sni_router.enabled: true\n---\n%s", out) + } +} + +// TestGenerateNodeConfig_sniRouterDefaultsFalse verifies a fresh install (no +// existing node.yaml) renders sni_router.enabled: false — default OFF. +func TestGenerateNodeConfig_sniRouterDefaultsFalse(t *testing.T) { + dir := t.TempDir() + cg := NewConfigGenerator(dir) + + out, err := cg.GenerateNodeConfig(nil, "10.0.0.5", "", "node-1.dbrs.space", "dbrs.space", false) + if err != nil { + t.Fatalf("GenerateNodeConfig failed: %v", err) + } + if !strings.Contains(out, "sni_router:") { + t.Fatalf("node.yaml missing sni_router block\n---\n%s", out) + } + if !strings.Contains(out, "enabled: false") { + t.Errorf("fresh node.yaml should render sni_router.enabled: false\n---\n%s", out) + } + if cg.SNIRouterEnabled() { + t.Errorf("SNIRouterEnabled() should be false on a fresh install") + } +} + +// TestGenerateNodeConfig_sniRouterDisabledStaysFalse verifies an existing +// node.yaml that explicitly disabled the router does not flip on during regen. 
+func TestGenerateNodeConfig_sniRouterDisabledStaysFalse(t *testing.T) { + dir := t.TempDir() + writeNodeYAML(t, dir, "sni_router:\n enabled: false\nhttp_gateway:\n enabled: true\n") + + cg := NewConfigGenerator(dir) + out, err := cg.GenerateNodeConfig(nil, "10.0.0.5", "", "node-1.dbrs.space", "dbrs.space", false) + if err != nil { + t.Fatalf("GenerateNodeConfig failed: %v", err) + } + if !strings.Contains(out, "enabled: false") { + t.Errorf("disabled sni_router should stay false on regen\n---\n%s", out) + } +} diff --git a/core/pkg/environments/templates/node.yaml b/core/pkg/environments/templates/node.yaml index 740d66d..552e766 100644 --- a/core/pkg/environments/templates/node.yaml +++ b/core/pkg/environments/templates/node.yaml @@ -15,6 +15,14 @@ node: operator_wallet: "{{.OperatorWallet}}" {{- end}} +# Stealth TURN-over-443 SNI router (feat-124). When enabled, the node runs +# orama-sni-router on :443 and Caddy is moved to :8443; default-OFF so existing +# nodes are byte-identical until an operator opts in. This block is preserved +# across config regeneration (GenerateNodeConfig carries forward an existing +# sni_router.enabled: true). +sni_router: + enabled: {{if .SNIRouterEnabled}}true{{else}}false{{end}} + database: data_dir: "{{.DataDir}}/rqlite" replication_factor: 3 diff --git a/core/pkg/environments/templates/render.go b/core/pkg/environments/templates/render.go index 27581f1..222f858 100644 --- a/core/pkg/environments/templates/render.go +++ b/core/pkg/environments/templates/render.go @@ -66,6 +66,16 @@ type NodeConfigData struct { SFUPort int // Local SFU signaling port the gateway proxies to TURNDomain string // TURN domain (e.g., "turn.ns-myapp.dbrs.space") TURNSecret string // HMAC-SHA1 shared secret for TURN credential generation + + // SNIRouterEnabled gates the stealth TURN-over-443 SNI router (feat-124). + // Rendered as the top-level sni_router.enabled flag. 
Default false keeps + // existing nodes byte-identical (Caddy stays on :443); when true the node + // runs orama-sni-router on :443 and Caddy moves to :8443. This value is + // carried forward across config regeneration from the existing node.yaml + // (see production/config.go populateSNIRouterConfig) so a regen never wipes + // an operator's opt-in (the same preserve-from-existing discipline as the + // webrtc block, bugboard #259/#846). + SNIRouterEnabled bool } // GatewayConfigData holds parameters for gateway.yaml rendering diff --git a/core/pkg/environments/templates/render_test.go b/core/pkg/environments/templates/render_test.go index 847f536..99f4f75 100644 --- a/core/pkg/environments/templates/render_test.go +++ b/core/pkg/environments/templates/render_test.go @@ -103,6 +103,36 @@ func TestRenderNodeConfig_webRTC(t *testing.T) { } } +func TestRenderNodeConfig_sniRouter(t *testing.T) { + // Enabled: top-level sni_router block renders enabled: true. + enabled, err := RenderNodeConfig(NodeConfigData{ + NodeID: "node1", + SNIRouterEnabled: true, + }) + if err != nil { + t.Fatalf("RenderNodeConfig failed: %v", err) + } + if !strings.Contains(enabled, "sni_router:") { + t.Errorf("rendered node config missing sni_router block\n---\n%s", enabled) + } + if !strings.Contains(enabled, "enabled: true") { + t.Errorf("sni_router should render enabled: true\n---\n%s", enabled) + } + + // Default: the block is always present, defaulting to false (so the flag is + // discoverable to operators and round-trips through regen). 
+ disabled, err := RenderNodeConfig(NodeConfigData{NodeID: "node1"}) + if err != nil { + t.Fatalf("RenderNodeConfig failed: %v", err) + } + if !strings.Contains(disabled, "sni_router:") { + t.Errorf("sni_router block should always be present\n---\n%s", disabled) + } + if !strings.Contains(disabled, "enabled: false") { + t.Errorf("default sni_router should render enabled: false\n---\n%s", disabled) + } +} + func TestRenderGatewayConfig(t *testing.T) { bootstrapMultiaddr := "/ip4/127.0.0.1/tcp/4001/p2p/Qm1234567890" data := GatewayConfigData{ diff --git a/core/pkg/gateway/gateway.go b/core/pkg/gateway/gateway.go index 1b880c8..2ffe518 100644 --- a/core/pkg/gateway/gateway.go +++ b/core/pkg/gateway/gateway.go @@ -1114,6 +1114,48 @@ func (g *Gateway) namespaceWebRTCDisablePublicHandler(w http.ResponseWriter, r * }) } +// namespaceWebRTCStealthPublicHandler handles POST /v1/namespace/webrtc/stealth/{enable|disable} +// (feat-124). Public: authenticated by JWT/API key via auth middleware; +// namespace from context. `enable` is true for the enable route. 
+func (g *Gateway) namespaceWebRTCStealthPublicHandler(w http.ResponseWriter, r *http.Request, enable bool) { + if r.Method != http.MethodPost { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + namespaceName, _ := r.Context().Value(CtxKeyNamespaceOverride).(string) + if namespaceName == "" { + writeError(w, http.StatusForbidden, "namespace not resolved") + return + } + + if g.webrtcManager == nil { + writeError(w, http.StatusServiceUnavailable, "WebRTC management not enabled") + return + } + + var err error + action := "disabled" + if enable { + action = "enabled" + err = g.webrtcManager.EnableWebRTCStealth(r.Context(), namespaceName) + } else { + err = g.webrtcManager.DisableWebRTCStealth(r.Context(), namespaceName) + } + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "ok", + "namespace": namespaceName, + "message": "WebRTC stealth " + action + " successfully", + }) +} + // namespaceWebRTCStatusPublicHandler handles GET /v1/namespace/webrtc/status // Public: authenticated by JWT/API key via auth middleware. Namespace from context. func (g *Gateway) namespaceWebRTCStatusPublicHandler(w http.ResponseWriter, r *http.Request) { diff --git a/core/pkg/gateway/handlers/auth/handlers.go b/core/pkg/gateway/handlers/auth/handlers.go index eb08721..b68f749 100644 --- a/core/pkg/gateway/handlers/auth/handlers.go +++ b/core/pkg/gateway/handlers/auth/handlers.go @@ -64,6 +64,12 @@ type WebRTCManager interface { DisableWebRTC(ctx context.Context, namespaceName string) error // GetWebRTCStatus returns the WebRTC config for a namespace, or nil if not enabled. 
GetWebRTCStatus(ctx context.Context, namespaceName string) (interface{}, error) + // EnableWebRTCStealth / DisableWebRTCStealth toggle the censorship- + // resistant TURNS:443 path (feat-124): stealth cert on the TURN servers, + // stealth DNS records, and the turns::443 rung in the + // turn.credentials URI ladder. Requires WebRTC to already be enabled. + EnableWebRTCStealth(ctx context.Context, namespaceName string) error + DisableWebRTCStealth(ctx context.Context, namespaceName string) error } // Handlers holds dependencies for authentication HTTP handlers diff --git a/core/pkg/gateway/handlers/namespace/spawn_handler.go b/core/pkg/gateway/handlers/namespace/spawn_handler.go index 1758b50..8c4860d 100644 --- a/core/pkg/gateway/handlers/namespace/spawn_handler.go +++ b/core/pkg/gateway/handlers/namespace/spawn_handler.go @@ -53,6 +53,8 @@ type SpawnRequest struct { GatewaySFUPort int `json:"gateway_sfu_port,omitempty"` GatewayTURNDomain string `json:"gateway_turn_domain,omitempty"` GatewayTURNSecret string `json:"gateway_turn_secret,omitempty"` + // Stealth TURNS:443 host (feat-124); empty when stealth is disabled. + GatewayTURNStealthDomain string `json:"gateway_turn_stealth_domain,omitempty"` // Host serverless secrets encryption key forwarded to the spawned // namespace gateway (bugboard #837 follow-up). Same value on every node. 
GatewaySecretsEncryptionKey string `json:"gateway_secrets_encryption_key,omitempty"` @@ -67,14 +69,15 @@ type SpawnRequest struct { RQLiteDSN string `json:"rqlite_dsn,omitempty"` // TURN config (when action = "spawn-turn") - TURNListenAddr string `json:"turn_listen_addr,omitempty"` - TURNTURNSAddr string `json:"turn_turns_addr,omitempty"` - TURNPublicIP string `json:"turn_public_ip,omitempty"` - TURNRealm string `json:"turn_realm,omitempty"` - TURNAuthSecret string `json:"turn_auth_secret,omitempty"` - TURNRelayStart int `json:"turn_relay_start,omitempty"` - TURNRelayEnd int `json:"turn_relay_end,omitempty"` - TURNDomain string `json:"turn_domain,omitempty"` + TURNListenAddr string `json:"turn_listen_addr,omitempty"` + TURNTURNSAddr string `json:"turn_turns_addr,omitempty"` + TURNPublicIP string `json:"turn_public_ip,omitempty"` + TURNRealm string `json:"turn_realm,omitempty"` + TURNAuthSecret string `json:"turn_auth_secret,omitempty"` + TURNRelayStart int `json:"turn_relay_start,omitempty"` + TURNRelayEnd int `json:"turn_relay_end,omitempty"` + TURNDomain string `json:"turn_domain,omitempty"` + TURNStealthDomain string `json:"turn_stealth_domain,omitempty"` // Cluster state (when action = "save-cluster-state") ClusterState json.RawMessage `json:"cluster_state,omitempty"` @@ -237,6 +240,7 @@ func (h *SpawnHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { WebRTCEnabled: req.GatewayWebRTCEnabled, SFUPort: req.GatewaySFUPort, TURNDomain: req.GatewayTURNDomain, + TURNStealthDomain: req.GatewayTURNStealthDomain, TURNSecret: req.GatewayTURNSecret, SecretsEncryptionKey: req.GatewaySecretsEncryptionKey, } @@ -291,6 +295,7 @@ func (h *SpawnHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { WebRTCEnabled: req.GatewayWebRTCEnabled, SFUPort: req.GatewaySFUPort, TURNDomain: req.GatewayTURNDomain, + TURNStealthDomain: req.GatewayTURNStealthDomain, TURNSecret: req.GatewayTURNSecret, SecretsEncryptionKey: req.GatewaySecretsEncryptionKey, } @@ -360,6 +365,7 @@ 
func (h *SpawnHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { RelayPortStart: req.TURNRelayStart, RelayPortEnd: req.TURNRelayEnd, TURNDomain: req.TURNDomain, + StealthDomain: req.TURNStealthDomain, } if err := h.systemdSpawner.SpawnTURN(ctx, req.Namespace, req.NodeID, cfg); err != nil { h.logger.Error("Failed to spawn TURN instance", zap.Error(err)) diff --git a/core/pkg/gateway/instance_spawner.go b/core/pkg/gateway/instance_spawner.go index 3b21f32..41deafd 100644 --- a/core/pkg/gateway/instance_spawner.go +++ b/core/pkg/gateway/instance_spawner.go @@ -95,6 +95,11 @@ type InstanceConfig struct { SFUPort int // SFU signaling port on this node TURNDomain string // TURN server domain (e.g., "turn.ns-alice.orama-devnet.network") TURNSecret string // TURN shared secret for credential generation + // TURNStealthDomain is the neutral stealth TURNS host (feat-124, + // cdn-.). Non-empty only when webrtc stealth is + // enabled for the namespace; turn.credentials then advertises + // `turns::443` as the final URI-ladder rung. + TURNStealthDomain string // SecretsEncryptionKey is the host-wide AES-256 serverless secrets // encryption key (hex-encoded). Bugboard #837 follow-up: the host gateway // receives this via gateway.Config but spawned namespace gateways never @@ -109,10 +114,11 @@ type InstanceConfig struct { // GatewayYAMLWebRTC represents the webrtc section of the gateway YAML config. // Must match yamlWebRTCCfg in cmd/gateway/config.go. 
type GatewayYAMLWebRTC struct { - Enabled bool `yaml:"enabled"` - SFUPort int `yaml:"sfu_port,omitempty"` - TURNDomain string `yaml:"turn_domain,omitempty"` - TURNSecret string `yaml:"turn_secret,omitempty"` + Enabled bool `yaml:"enabled"` + SFUPort int `yaml:"sfu_port,omitempty"` + TURNDomain string `yaml:"turn_domain,omitempty"` + TURNSecret string `yaml:"turn_secret,omitempty"` + TURNStealthDomain string `yaml:"turn_stealth_domain,omitempty"` } // GatewayYAMLConfig represents the gateway YAML configuration structure @@ -334,10 +340,11 @@ func (is *InstanceSpawner) generateConfig(configPath string, cfg InstanceConfig, IPFSAPIURL: cfg.IPFSAPIURL, IPFSReplicationFactor: cfg.IPFSReplicationFactor, WebRTC: GatewayYAMLWebRTC{ - Enabled: cfg.WebRTCEnabled, - SFUPort: cfg.SFUPort, - TURNDomain: cfg.TURNDomain, - TURNSecret: cfg.TURNSecret, + Enabled: cfg.WebRTCEnabled, + SFUPort: cfg.SFUPort, + TURNDomain: cfg.TURNDomain, + TURNSecret: cfg.TURNSecret, + TURNStealthDomain: cfg.TURNStealthDomain, }, SecretsEncryptionKey: cfg.SecretsEncryptionKey, } diff --git a/core/pkg/gateway/routes.go b/core/pkg/gateway/routes.go index ee3c189..7ac4b9e 100644 --- a/core/pkg/gateway/routes.go +++ b/core/pkg/gateway/routes.go @@ -67,6 +67,12 @@ func (g *Gateway) Routes() http.Handler { // Namespace WebRTC enable/disable/status (public, JWT/API key auth via middleware) mux.HandleFunc("/v1/namespace/webrtc/enable", g.namespaceWebRTCEnablePublicHandler) mux.HandleFunc("/v1/namespace/webrtc/disable", g.namespaceWebRTCDisablePublicHandler) + mux.HandleFunc("/v1/namespace/webrtc/stealth/enable", func(w http.ResponseWriter, r *http.Request) { + g.namespaceWebRTCStealthPublicHandler(w, r, true) + }) + mux.HandleFunc("/v1/namespace/webrtc/stealth/disable", func(w http.ResponseWriter, r *http.Request) { + g.namespaceWebRTCStealthPublicHandler(w, r, false) + }) mux.HandleFunc("/v1/namespace/webrtc/status", g.namespaceWebRTCStatusPublicHandler) // auth endpoints diff --git 
a/core/pkg/namespace/cluster_manager.go b/core/pkg/namespace/cluster_manager.go index 4f687ce..905e8d9 100644 --- a/core/pkg/namespace/cluster_manager.go +++ b/core/pkg/namespace/cluster_manager.go @@ -678,23 +678,24 @@ func (cm *ClusterManager) spawnGatewayRemote(ctx context.Context, nodeIP string, } resp, err := cm.sendSpawnRequest(ctx, nodeIP, map[string]interface{}{ - "action": "spawn-gateway", - "namespace": cfg.Namespace, - "node_id": cfg.NodeID, - "gateway_http_port": cfg.HTTPPort, - "gateway_base_domain": cfg.BaseDomain, - "gateway_rqlite_dsn": cfg.RQLiteDSN, - "gateway_global_rqlite_dsn": cfg.GlobalRQLiteDSN, - "gateway_olric_servers": cfg.OlricServers, - "gateway_olric_timeout": olricTimeout, - "ipfs_cluster_api_url": cfg.IPFSClusterAPIURL, - "ipfs_api_url": cfg.IPFSAPIURL, - "ipfs_timeout": ipfsTimeout, - "ipfs_replication_factor": cfg.IPFSReplicationFactor, - "gateway_webrtc_enabled": cfg.WebRTCEnabled, - "gateway_sfu_port": cfg.SFUPort, - "gateway_turn_domain": cfg.TURNDomain, - "gateway_turn_secret": cfg.TURNSecret, + "action": "spawn-gateway", + "namespace": cfg.Namespace, + "node_id": cfg.NodeID, + "gateway_http_port": cfg.HTTPPort, + "gateway_base_domain": cfg.BaseDomain, + "gateway_rqlite_dsn": cfg.RQLiteDSN, + "gateway_global_rqlite_dsn": cfg.GlobalRQLiteDSN, + "gateway_olric_servers": cfg.OlricServers, + "gateway_olric_timeout": olricTimeout, + "ipfs_cluster_api_url": cfg.IPFSClusterAPIURL, + "ipfs_api_url": cfg.IPFSAPIURL, + "ipfs_timeout": ipfsTimeout, + "ipfs_replication_factor": cfg.IPFSReplicationFactor, + "gateway_webrtc_enabled": cfg.WebRTCEnabled, + "gateway_sfu_port": cfg.SFUPort, + "gateway_turn_domain": cfg.TURNDomain, + "gateway_turn_secret": cfg.TURNSecret, + "gateway_turn_stealth_domain": cfg.TURNStealthDomain, // Bugboard #837 follow-up: carry the host secrets encryption key to // the remote node so its spawned namespace gateway can manage secrets. 
"gateway_secrets_encryption_key": cfg.SecretsEncryptionKey, @@ -1614,6 +1615,7 @@ func (cm *ClusterManager) restoreClusterOnNode(ctx context.Context, clusterID, n gwCfg.SFUPort = sfuBlock.SFUSignalingPort gwCfg.TURNDomain = fmt.Sprintf("turn.ns-%s.%s", namespaceName, cm.baseDomain) gwCfg.TURNSecret = webrtcCfg.TURNSharedSecret + gwCfg.TURNStealthDomain = cm.stealthDomainFor(namespaceName, webrtcCfg) } } @@ -1679,8 +1681,9 @@ type ClusterLocalState struct { // WebRTC fields (zero values when WebRTC not enabled — backward compatible) HasSFU bool `json:"has_sfu,omitempty"` HasTURN bool `json:"has_turn,omitempty"` - TURNSharedSecret string `json:"turn_shared_secret,omitempty"` // Needed for gateway to generate TURN credentials on cold start - TURNDomain string `json:"turn_domain,omitempty"` // TURN server domain for gateway config + TURNSharedSecret string `json:"turn_shared_secret,omitempty"` // Needed for gateway to generate TURN credentials on cold start + TURNDomain string `json:"turn_domain,omitempty"` // TURN server domain for gateway config + TURNStealthDomain string `json:"turn_stealth_domain,omitempty"` // Stealth TURNS:443 host (feat-124); empty when stealth disabled TURNCredentialTTL int `json:"turn_credential_ttl,omitempty"` SFUSignalingPort int `json:"sfu_signaling_port,omitempty"` SFUMediaPortStart int `json:"sfu_media_port_start,omitempty"` @@ -1836,10 +1839,11 @@ func (cm *ClusterManager) RestoreLocalClustersFromDisk(ctx context.Context) (int // restoreWebRTC is the resolved WebRTC gateway config for a restored // namespace gateway. type restoreWebRTC struct { - enabled bool - sfuPort int - turnDomain string - turnSecret string + enabled bool + sfuPort int + turnDomain string + turnSecret string + stealthDomain string // feat-124: empty when webrtc stealth is disabled } // chooseRestoreWebRTC resolves a restored gateway's WebRTC config. 
TWO @@ -1864,11 +1868,12 @@ type restoreWebRTC struct { // Extracted as a pure function so the precedence is unit-testable without // standing up the full restore path (systemd spawner + DB + port store). func chooseRestoreWebRTC( - stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret string, - dbFetch func() (turnSecret, turnDomain string, sfuPort int), + stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string, + dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int), ) restoreWebRTC { turnSecret := stateTURNSecret turnDomain := stateTURNDomain + stealthDomain := stateStealthDomain sfuPort := 0 if stateHasSFU && stateSFUPort > 0 { sfuPort = stateSFUPort @@ -1878,12 +1883,17 @@ func chooseRestoreWebRTC( // the marker that the namespace has WebRTC enabled at all. The state // file is not updated by EnableWebRTC, so a namespace enabled after // the state file was written reaches here with an empty secret. + // (Stealth toggles DO rewrite cluster state on every node, so the + // state-first read stays fresh for stealthDomain too.) if turnSecret == "" { - if dbSecret, dbDomain, dbSFU := dbFetch(); dbSecret != "" { + if dbSecret, dbDomain, dbStealth, dbSFU := dbFetch(); dbSecret != "" { turnSecret = dbSecret if turnDomain == "" { turnDomain = dbDomain } + if stealthDomain == "" { + stealthDomain = dbStealth + } if sfuPort == 0 { sfuPort = dbSFU } @@ -1891,10 +1901,11 @@ func chooseRestoreWebRTC( } return restoreWebRTC{ - enabled: turnSecret != "" || sfuPort > 0, - sfuPort: sfuPort, - turnDomain: turnDomain, - turnSecret: turnSecret, + enabled: turnSecret != "" || sfuPort > 0, + sfuPort: sfuPort, + turnDomain: turnDomain, + turnSecret: turnSecret, + stealthDomain: stealthDomain, } } @@ -2050,11 +2061,11 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl // fields here. The lazy dbFetch only hits the DB when the state // file is incomplete. 
wr := chooseRestoreWebRTC( - state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, - func() (turnSecret, turnDomain string, sfuPort int) { + state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain, + func() (turnSecret, turnDomain, stealthDomain string, sfuPort int) { webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName) if err != nil || webrtcCfg == nil { - return "", "", 0 + return "", "", "", 0 } // TURN is namespace-wide; SFU port is per-node and may be // absent on a gateway-only (non-SFU) node — that's fine, @@ -2065,6 +2076,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl } return webrtcCfg.TURNSharedSecret, fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain), + cm.stealthDomainFor(state.NamespaceName, webrtcCfg), sfu }, ) @@ -2076,6 +2088,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl gwCfg.SFUPort = wr.sfuPort gwCfg.TURNDomain = wr.turnDomain gwCfg.TURNSecret = wr.turnSecret + gwCfg.TURNStealthDomain = wr.stealthDomain } resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort)) @@ -2126,6 +2139,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl RelayPortStart: state.TURNRelayPortStart, RelayPortEnd: state.TURNRelayPortEnd, TURNDomain: fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain), + StealthDomain: cm.stealthDomainFor(state.NamespaceName, webrtcCfg), } if err := cm.systemdSpawner.SpawnTURN(ctx, state.NamespaceName, cm.localNodeID, turnCfg); err != nil { cm.logger.Error("Failed to restore TURN from state", zap.String("namespace", state.NamespaceName), zap.Error(err)) diff --git a/core/pkg/namespace/cluster_manager_stealth.go b/core/pkg/namespace/cluster_manager_stealth.go new file mode 100644 index 0000000..cfab07b --- /dev/null +++ b/core/pkg/namespace/cluster_manager_stealth.go @@ -0,0 +1,263 @@ +package 
namespace + +import ( + "context" + "fmt" + + "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/turn" + "go.uber.org/zap" +) + +// Stealth TURNS-over-443 lifecycle (feat-124, censorship-resistant calling). +// +// Enabling stealth for a namespace whose WebRTC is already running: +// 1. creates DNS A records for the neutral stealth host -> the TURN nodes, +// 2. flips namespace_webrtc_config.stealth_enabled, +// 3. re-spawns the namespace's TURN servers with the stealth domain (the +// spawner provisions a Let's Encrypt cert for it — hard-fail, never +// self-signed), +// 4. rewrites cluster-state.json on every node (so DB-less restores keep +// the stealth domain), and +// 5. restarts the namespace gateways so turn.credentials advertises +// `turns::443` as the final URI-ladder rung. +// +// The SNI router on :443 discovers the route (stealth host -> local TURN TLS +// port) from the TURN config files on disk — no extra registration step. + +// stealthDomainFor returns the namespace's stealth TURNS host when stealth is +// enabled in its WebRTC config, else "" (callers treat empty as disabled). +func (cm *ClusterManager) stealthDomainFor(namespaceName string, webrtcCfg *WebRTCConfig) string { + if webrtcCfg == nil || !webrtcCfg.StealthEnabled { + return "" + } + return turn.StealthHostForNamespace(namespaceName, cm.baseDomain) +} + +// EnableWebRTCStealth enables the stealth TURNS:443 path for a namespace. +// Requires WebRTC to already be enabled. 
+func (cm *ClusterManager) EnableWebRTCStealth(ctx context.Context, namespaceName string) error { + cluster, webrtcCfg, err := cm.getStealthPrereqs(ctx, namespaceName) + if err != nil { + return err + } + if webrtcCfg.StealthEnabled { + return ErrWebRTCStealthAlreadyEnabled + } + + stealthDomain := turn.StealthHostForNamespace(namespaceName, cm.baseDomain) + cm.logger.Info("Enabling WebRTC stealth for namespace", + zap.String("namespace", namespaceName), + zap.String("stealth_domain", stealthDomain)) + + clusterNodes, err := cm.getClusterNodesWithIPs(ctx, cluster.ID) + if err != nil { + return fmt.Errorf("failed to get cluster nodes: %w", err) + } + turnBlocks, err := cm.getWebRTCBlocksByType(ctx, cluster.ID, "turn") + if err != nil { + return fmt.Errorf("failed to get TURN allocations for namespace %s: %w", namespaceName, err) + } + if len(turnBlocks) == 0 { + return fmt.Errorf("no TURN allocations found for namespace %s (is WebRTC fully enabled?)", namespaceName) + } + + // DNS first — cert provisioning and clients both need the name to resolve. + var turnIPs []string + for _, block := range turnBlocks { + for _, n := range clusterNodes { + if n.NodeID == block.NodeID { + turnIPs = append(turnIPs, n.PublicIP) + } + } + } + if err := cm.dnsManager.CreateStealthTURNRecords(ctx, namespaceName, stealthDomain, turnIPs); err != nil { + return fmt.Errorf("failed to create stealth DNS records: %w", err) + } + + if err := cm.setStealthEnabled(ctx, cluster.ID, true); err != nil { + return err + } + + // Re-spawn TURN with the stealth domain; roll back on failure so the + // board never claims a stealth endpoint that doesn't terminate TLS. 
+ if err := cm.respawnTURNWithStealth(ctx, cluster, clusterNodes, turnBlocks, webrtcCfg.TURNSharedSecret, stealthDomain); err != nil { + cm.rollbackStealthEnable(ctx, cluster.ID, namespaceName) + return fmt.Errorf("failed to re-spawn TURN with stealth cert (stealth rolled back): %w", err) + } + + cm.refreshStateAndGateways(ctx, cluster, clusterNodes, stealthDomain, webrtcCfg.TURNSharedSecret) + cm.logEvent(ctx, cluster.ID, EventWebRTCEnabled, "", + fmt.Sprintf("WebRTC stealth enabled (%s)", stealthDomain), nil) + return nil +} + +// DisableWebRTCStealth turns the stealth TURNS:443 path off again. TURN and +// the baseline ladder (udp/tcp 3478, turns:5349) keep running. +func (cm *ClusterManager) DisableWebRTCStealth(ctx context.Context, namespaceName string) error { + cluster, webrtcCfg, err := cm.getStealthPrereqs(ctx, namespaceName) + if err != nil { + return err + } + if !webrtcCfg.StealthEnabled { + return ErrWebRTCStealthNotEnabled + } + + cm.logger.Info("Disabling WebRTC stealth for namespace", zap.String("namespace", namespaceName)) + + clusterNodes, err := cm.getClusterNodesWithIPs(ctx, cluster.ID) + if err != nil { + return fmt.Errorf("failed to get cluster nodes: %w", err) + } + turnBlocks, err := cm.getWebRTCBlocksByType(ctx, cluster.ID, "turn") + if err != nil { + return fmt.Errorf("failed to get TURN allocations: %w", err) + } + + if err := cm.setStealthEnabled(ctx, cluster.ID, false); err != nil { + return err + } + if err := cm.respawnTURNWithStealth(ctx, cluster, clusterNodes, turnBlocks, webrtcCfg.TURNSharedSecret, ""); err != nil { + return fmt.Errorf("failed to re-spawn TURN without stealth: %w", err) + } + if err := cm.dnsManager.DeleteStealthTURNRecords(ctx, namespaceName); err != nil { + cm.logger.Warn("Failed to delete stealth DNS records", zap.Error(err)) + } + cm.refreshStateAndGateways(ctx, cluster, clusterNodes, "", webrtcCfg.TURNSharedSecret) + cm.logEvent(ctx, cluster.ID, EventWebRTCDisabled, "", "WebRTC stealth disabled", nil) + return 
nil +} + +// getStealthPrereqs validates the cluster exists and WebRTC is enabled, +// returning both records (with the TURN secret already decrypted). +func (cm *ClusterManager) getStealthPrereqs(ctx context.Context, namespaceName string) (*NamespaceCluster, *WebRTCConfig, error) { + cluster, err := cm.GetClusterByNamespace(ctx, namespaceName) + if err != nil { + return nil, nil, fmt.Errorf("failed to get cluster: %w", err) + } + if cluster == nil { + return nil, nil, ErrClusterNotFound + } + webrtcCfg, err := cm.GetWebRTCConfig(ctx, namespaceName) + if err != nil { + return nil, nil, fmt.Errorf("failed to get WebRTC config: %w", err) + } + if webrtcCfg == nil { + return nil, nil, ErrWebRTCNotEnabled + } + return cluster, webrtcCfg, nil +} + +// setStealthEnabled flips the stealth flag in namespace_webrtc_config. +func (cm *ClusterManager) setStealthEnabled(ctx context.Context, clusterID string, enabled bool) error { + internalCtx := client.WithInternalAuth(ctx) + val := 0 + if enabled { + val = 1 + } + if _, err := cm.db.Exec(internalCtx, + `UPDATE namespace_webrtc_config SET stealth_enabled = ? WHERE namespace_cluster_id = ? AND enabled = 1`, + val, clusterID); err != nil { + return fmt.Errorf("failed to update stealth_enabled: %w", err) + } + return nil +} + +// respawnTURNWithStealth stops and re-spawns every TURN instance of the +// cluster with the given stealth domain ("" = stealth off). The spawner +// provisions the stealth cert and writes the new TURN config; the SNI +// router's discovery picks the route change up from disk. 
+func (cm *ClusterManager) respawnTURNWithStealth( + ctx context.Context, + cluster *NamespaceCluster, + clusterNodes []clusterNodeInfo, + turnBlocks []WebRTCPortBlock, + turnSecret, stealthDomain string, +) error { + turnDomain := fmt.Sprintf("turn.ns-%s.%s", cluster.NamespaceName, cm.baseDomain) + for _, block := range turnBlocks { + var node *clusterNodeInfo + for i := range clusterNodes { + if clusterNodes[i].NodeID == block.NodeID { + node = &clusterNodes[i] + break + } + } + if node == nil { + return fmt.Errorf("TURN node %s not found in cluster nodes", block.NodeID) + } + + cm.stopTURNOnNode(ctx, node.NodeID, node.InternalIP, cluster.NamespaceName) + turnCfg := TURNInstanceConfig{ + Namespace: cluster.NamespaceName, + NodeID: node.NodeID, + ListenAddr: fmt.Sprintf("0.0.0.0:%d", block.TURNListenPort), + TURNSListenAddr: fmt.Sprintf("0.0.0.0:%d", block.TURNTLSPort), + PublicIP: node.PublicIP, + Realm: cm.baseDomain, + AuthSecret: turnSecret, + RelayPortStart: block.TURNRelayPortStart, + RelayPortEnd: block.TURNRelayPortEnd, + TURNDomain: turnDomain, + StealthDomain: stealthDomain, + } + if err := cm.spawnTURNOnNode(ctx, *node, cluster.NamespaceName, turnCfg); err != nil { + return fmt.Errorf("failed to re-spawn TURN on node %s: %w", node.NodeID, err) + } + } + return nil +} + +// rollbackStealthEnable best-effort reverts the DB flag + DNS records after a +// failed stealth enable, so the system never advertises a half-built path. 
+func (cm *ClusterManager) rollbackStealthEnable(ctx context.Context, clusterID, namespaceName string) { + if err := cm.setStealthEnabled(ctx, clusterID, false); err != nil { + cm.logger.Warn("Stealth rollback: failed to clear stealth_enabled", zap.Error(err)) + } + if err := cm.dnsManager.DeleteStealthTURNRecords(ctx, namespaceName); err != nil { + cm.logger.Warn("Stealth rollback: failed to delete DNS records", zap.Error(err)) + } +} + +// refreshStateAndGateways rewrites cluster-state.json on all nodes with the +// new stealth domain and restarts the namespace gateways so turn.credentials +// reflects the change. Failures are logged per node (the reconciler converges +// stragglers later via the gatewayConfigInSync drift check). +func (cm *ClusterManager) refreshStateAndGateways( + ctx context.Context, + cluster *NamespaceCluster, + clusterNodes []clusterNodeInfo, + stealthDomain, turnSecret string, +) { + turnDomain := fmt.Sprintf("turn.ns-%s.%s", cluster.NamespaceName, cm.baseDomain) + + sfuBlockList, err := cm.getWebRTCBlocksByType(ctx, cluster.ID, "sfu") + if err != nil { + cm.logger.Warn("Failed to get SFU allocations for state refresh", zap.Error(err)) + } + turnBlockList, err := cm.getWebRTCBlocksByType(ctx, cluster.ID, "turn") + if err != nil { + cm.logger.Warn("Failed to get TURN allocations for state refresh", zap.Error(err)) + } + sfuBlocks := make(map[string]*WebRTCPortBlock) + for i := range sfuBlockList { + sfuBlocks[sfuBlockList[i].NodeID] = &sfuBlockList[i] + } + turnBlocks := make(map[string]*WebRTCPortBlock) + for i := range turnBlockList { + turnBlocks[turnBlockList[i].NodeID] = &turnBlockList[i] + } + + cm.updateClusterStateWithWebRTC(ctx, cluster, clusterNodes, sfuBlocks, turnBlocks, turnDomain, stealthDomain, turnSecret) + + portBlocks, err := cm.portAllocator.GetAllPortBlocks(ctx, cluster.ID) + if err != nil { + cm.logger.Warn("Failed to get port blocks for gateway restart after stealth toggle", zap.Error(err)) + return + } + 
nodePortBlocks := make(map[string]*PortBlock) + for i := range portBlocks { + nodePortBlocks[portBlocks[i].NodeID] = &portBlocks[i] + } + cm.restartGatewaysWithWebRTC(ctx, cluster, clusterNodes, nodePortBlocks, sfuBlocks, turnDomain, stealthDomain, turnSecret) +} diff --git a/core/pkg/namespace/cluster_manager_webrtc.go b/core/pkg/namespace/cluster_manager_webrtc.go index b1b1859..8aa1005 100644 --- a/core/pkg/namespace/cluster_manager_webrtc.go +++ b/core/pkg/namespace/cluster_manager_webrtc.go @@ -204,10 +204,10 @@ func (cm *ClusterManager) EnableWebRTC(ctx context.Context, namespaceName, enabl } // 14. Update cluster-state.json on all nodes with WebRTC info - cm.updateClusterStateWithWebRTC(ctx, cluster, clusterNodes, sfuBlocks, turnBlocks, turnDomain, turnSecret) + cm.updateClusterStateWithWebRTC(ctx, cluster, clusterNodes, sfuBlocks, turnBlocks, turnDomain, "", turnSecret) // 15. Restart namespace gateways with WebRTC config so they register WebRTC routes - cm.restartGatewaysWithWebRTC(ctx, cluster, clusterNodes, nodePortBlocks, sfuBlocks, turnDomain, turnSecret) + cm.restartGatewaysWithWebRTC(ctx, cluster, clusterNodes, nodePortBlocks, sfuBlocks, turnDomain, "", turnSecret) cm.logEvent(ctx, cluster.ID, EventWebRTCEnabled, "", fmt.Sprintf("WebRTC enabled: SFU on %d nodes, TURN on %d nodes", len(clusterNodes), len(turnNodes)), nil) @@ -273,17 +273,23 @@ func (cm *ClusterManager) DisableWebRTC(ctx context.Context, namespaceName strin cm.logger.Warn("Failed to deallocate WebRTC ports", zap.Error(err)) } - // 7. Delete TURN DNS records + // 7. Delete TURN DNS records (both the regular and the feat-124 stealth + // records — a full WebRTC teardown must not orphan stealth A records when + // the namespace had stealth enabled). Delete-by-tag is a no-op when the + // stealth records are absent, so this is safe unconditionally. 
if err := cm.dnsManager.DeleteTURNRecords(ctx, namespaceName); err != nil { cm.logger.Warn("Failed to delete TURN DNS records", zap.Error(err)) } + if err := cm.dnsManager.DeleteStealthTURNRecords(ctx, namespaceName); err != nil { + cm.logger.Warn("Failed to delete stealth TURN DNS records", zap.Error(err)) + } // 8. Clean up DB tables cm.db.Exec(internalCtx, `DELETE FROM webrtc_rooms WHERE namespace_cluster_id = ?`, cluster.ID) cm.db.Exec(internalCtx, `DELETE FROM namespace_webrtc_config WHERE namespace_cluster_id = ?`, cluster.ID) // 9. Update cluster-state.json to remove WebRTC info - cm.updateClusterStateWithWebRTC(ctx, cluster, clusterNodes, nil, nil, "", "") + cm.updateClusterStateWithWebRTC(ctx, cluster, clusterNodes, nil, nil, "", "", "") // 10. Restart namespace gateways without WebRTC config so they unregister WebRTC routes portBlocks, err := cm.portAllocator.GetAllPortBlocks(ctx, cluster.ID) @@ -292,7 +298,7 @@ func (cm *ClusterManager) DisableWebRTC(ctx context.Context, namespaceName strin for i := range portBlocks { nodePortBlocks[portBlocks[i].NodeID] = &portBlocks[i] } - cm.restartGatewaysWithWebRTC(ctx, cluster, clusterNodes, nodePortBlocks, nil, "", "") + cm.restartGatewaysWithWebRTC(ctx, cluster, clusterNodes, nodePortBlocks, nil, "", "", "") } else { cm.logger.Warn("Failed to get port blocks for gateway restart after WebRTC disable", zap.Error(err)) } @@ -487,17 +493,18 @@ func (cm *ClusterManager) spawnSFURemote(ctx context.Context, nodeIP string, cfg // spawnTURNRemote sends a spawn-turn request to a remote node func (cm *ClusterManager) spawnTURNRemote(ctx context.Context, nodeIP string, cfg TURNInstanceConfig) error { _, err := cm.sendSpawnRequest(ctx, nodeIP, map[string]interface{}{ - "action": "spawn-turn", - "namespace": cfg.Namespace, - "node_id": cfg.NodeID, - "turn_listen_addr": cfg.ListenAddr, - "turn_turns_addr": cfg.TURNSListenAddr, - "turn_public_ip": cfg.PublicIP, - "turn_realm": cfg.Realm, - "turn_auth_secret": cfg.AuthSecret, - 
"turn_relay_start": cfg.RelayPortStart, - "turn_relay_end": cfg.RelayPortEnd, - "turn_domain": cfg.TURNDomain, + "action": "spawn-turn", + "namespace": cfg.Namespace, + "node_id": cfg.NodeID, + "turn_listen_addr": cfg.ListenAddr, + "turn_turns_addr": cfg.TURNSListenAddr, + "turn_public_ip": cfg.PublicIP, + "turn_realm": cfg.Realm, + "turn_auth_secret": cfg.AuthSecret, + "turn_relay_start": cfg.RelayPortStart, + "turn_relay_end": cfg.RelayPortEnd, + "turn_domain": cfg.TURNDomain, + "turn_stealth_domain": cfg.StealthDomain, }) return err } @@ -558,7 +565,7 @@ func (cm *ClusterManager) updateClusterStateWithWebRTC( nodes []clusterNodeInfo, sfuBlocks map[string]*WebRTCPortBlock, turnBlocks map[string]*WebRTCPortBlock, - turnDomain, turnSecret string, + turnDomain, turnStealthDomain, turnSecret string, ) { // Get existing port blocks for base state portBlocks, err := cm.portAllocator.GetAllPortBlocks(ctx, cluster.ID) @@ -635,6 +642,7 @@ func (cm *ClusterManager) updateClusterStateWithWebRTC( } // Persist TURN domain and secret so gateways can be restored on cold start state.TURNDomain = turnDomain + state.TURNStealthDomain = turnStealthDomain state.TURNSharedSecret = turnSecret if node.NodeID == cm.localNodeID { @@ -671,7 +679,7 @@ func (cm *ClusterManager) restartGatewaysWithWebRTC( nodes []clusterNodeInfo, portBlocks map[string]*PortBlock, sfuBlocks map[string]*WebRTCPortBlock, - turnDomain, turnSecret string, + turnDomain, turnStealthDomain, turnSecret string, ) { // Build Olric server addresses from port blocks + node IPs var olricServers []string @@ -715,6 +723,7 @@ func (cm *ClusterManager) restartGatewaysWithWebRTC( WebRTCEnabled: webrtcEnabled, SFUPort: sfuPort, TURNDomain: turnDomain, + TURNStealthDomain: turnStealthDomain, TURNSecret: turnSecret, // Bugboard #837 follow-up: preserve the secrets key on WebRTC // restarts so enabling WebRTC doesn't drop secrets management. 
@@ -750,23 +759,24 @@ func (cm *ClusterManager) restartGatewayRemote(ctx context.Context, nodeIP strin } _, err := cm.sendSpawnRequest(ctx, nodeIP, map[string]interface{}{ - "action": "restart-gateway", - "namespace": cfg.Namespace, - "node_id": cfg.NodeID, - "gateway_http_port": cfg.HTTPPort, - "gateway_base_domain": cfg.BaseDomain, - "gateway_rqlite_dsn": cfg.RQLiteDSN, - "gateway_global_rqlite_dsn": cfg.GlobalRQLiteDSN, - "gateway_olric_servers": cfg.OlricServers, - "gateway_olric_timeout": olricTimeout, - "ipfs_cluster_api_url": cfg.IPFSClusterAPIURL, - "ipfs_api_url": cfg.IPFSAPIURL, - "ipfs_timeout": ipfsTimeout, - "ipfs_replication_factor": cfg.IPFSReplicationFactor, - "gateway_webrtc_enabled": cfg.WebRTCEnabled, - "gateway_sfu_port": cfg.SFUPort, - "gateway_turn_domain": cfg.TURNDomain, - "gateway_turn_secret": cfg.TURNSecret, + "action": "restart-gateway", + "namespace": cfg.Namespace, + "node_id": cfg.NodeID, + "gateway_http_port": cfg.HTTPPort, + "gateway_base_domain": cfg.BaseDomain, + "gateway_rqlite_dsn": cfg.RQLiteDSN, + "gateway_global_rqlite_dsn": cfg.GlobalRQLiteDSN, + "gateway_olric_servers": cfg.OlricServers, + "gateway_olric_timeout": olricTimeout, + "ipfs_cluster_api_url": cfg.IPFSClusterAPIURL, + "ipfs_api_url": cfg.IPFSAPIURL, + "ipfs_timeout": ipfsTimeout, + "ipfs_replication_factor": cfg.IPFSReplicationFactor, + "gateway_webrtc_enabled": cfg.WebRTCEnabled, + "gateway_sfu_port": cfg.SFUPort, + "gateway_turn_domain": cfg.TURNDomain, + "gateway_turn_stealth_domain": cfg.TURNStealthDomain, + "gateway_turn_secret": cfg.TURNSecret, // Bugboard #837 follow-up: preserve the secrets key on WebRTC restarts. 
"gateway_secrets_encryption_key": cfg.SecretsEncryptionKey, }) diff --git a/core/pkg/namespace/cluster_recovery.go b/core/pkg/namespace/cluster_recovery.go index e367018..b98acaf 100644 --- a/core/pkg/namespace/cluster_recovery.go +++ b/core/pkg/namespace/cluster_recovery.go @@ -537,6 +537,7 @@ func (cm *ClusterManager) ReplaceClusterNode(ctx context.Context, cluster *Names gwCfg.SFUPort = sfuBlock.SFUSignalingPort gwCfg.TURNDomain = fmt.Sprintf("turn.ns-%s.%s", cluster.NamespaceName, cm.baseDomain) gwCfg.TURNSecret = webrtcCfg.TURNSharedSecret + gwCfg.TURNStealthDomain = cm.stealthDomainFor(cluster.NamespaceName, webrtcCfg) } } @@ -1080,6 +1081,7 @@ func (cm *ClusterManager) addNodeToCluster( gwCfg.SFUPort = sfuBlock.SFUSignalingPort gwCfg.TURNDomain = fmt.Sprintf("turn.ns-%s.%s", cluster.NamespaceName, cm.baseDomain) gwCfg.TURNSecret = webrtcCfg.TURNSharedSecret + gwCfg.TURNStealthDomain = cm.stealthDomainFor(cluster.NamespaceName, webrtcCfg) } } diff --git a/core/pkg/namespace/dns_manager.go b/core/pkg/namespace/dns_manager.go index b93f0d4..65ec955 100644 --- a/core/pkg/namespace/dns_manager.go +++ b/core/pkg/namespace/dns_manager.go @@ -353,6 +353,78 @@ func (drm *DNSRecordManager) DeleteTURNRecords(ctx context.Context, namespaceNam return nil } +// stealthDNSNamespace is the dns_records ownership tag for a namespace's +// stealth TURNS records, distinct from "namespace-turn:" so deleting one set +// never touches the other. +func stealthDNSNamespace(namespaceName string) string { + return "namespace-turn-stealth:" + namespaceName +} + +// CreateStealthTURNRecords creates DNS A records for the stealth TURNS host +// (feat-124): -> TURN node IPs. The hostname is the neutral +// cdn-. label from turn.StealthHostForNamespace — it lives +// directly under the base domain (NOT under ns-) so the SNI string +// never identifies the app. 
+func (drm *DNSRecordManager) CreateStealthTURNRecords(ctx context.Context, namespaceName, stealthHost string, turnIPs []string) error { + internalCtx := client.WithInternalAuth(ctx) + + if stealthHost == "" { + return &ClusterError{Message: "no stealth host provided for DNS records"} + } + if len(turnIPs) == 0 { + return &ClusterError{Message: "no TURN IPs provided for stealth DNS records"} + } + + fqdn := stealthHost + "." + + drm.logger.Info("Creating stealth TURNS DNS records", + zap.String("namespace", namespaceName), + zap.String("fqdn", fqdn), + zap.Strings("turn_ips", turnIPs), + ) + + deleteQuery := `DELETE FROM dns_records WHERE namespace = ?` + _, _ = drm.db.Exec(internalCtx, deleteQuery, stealthDNSNamespace(namespaceName)) + + now := time.Now() + for _, ip := range turnIPs { + insertQuery := ` + INSERT INTO dns_records ( + fqdn, record_type, value, ttl, namespace, created_by, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ` + _, err := drm.db.Exec(internalCtx, insertQuery, + fqdn, "A", ip, 60, + stealthDNSNamespace(namespaceName), + "cluster-manager", + now, now, + ) + if err != nil { + return &ClusterError{ + Message: fmt.Sprintf("failed to create stealth TURNS DNS record %s -> %s", fqdn, ip), + Cause: err, + } + } + } + + return nil +} + +// DeleteStealthTURNRecords deletes a namespace's stealth TURNS DNS records. 
+func (drm *DNSRecordManager) DeleteStealthTURNRecords(ctx context.Context, namespaceName string) error { + internalCtx := client.WithInternalAuth(ctx) + + deleteQuery := `DELETE FROM dns_records WHERE namespace = ?` + _, err := drm.db.Exec(internalCtx, deleteQuery, stealthDNSNamespace(namespaceName)) + if err != nil { + return &ClusterError{ + Message: "failed to delete stealth TURNS DNS records", + Cause: err, + } + } + return nil +} + // EnableNamespaceRecord marks a specific IP's record as active (for recovery) func (drm *DNSRecordManager) EnableNamespaceRecord(ctx context.Context, namespaceName, ip string) error { internalCtx := client.WithInternalAuth(ctx) diff --git a/core/pkg/namespace/reconcile_gateway_test.go b/core/pkg/namespace/reconcile_gateway_test.go index 4e88aa4..3cd93e8 100644 --- a/core/pkg/namespace/reconcile_gateway_test.go +++ b/core/pkg/namespace/reconcile_gateway_test.go @@ -55,7 +55,7 @@ func TestGatewayWebRTCInSync_matchingBlock_returnsTrue(t *testing.T) { func TestGatewayWebRTCInSync_eachFieldDriftDetected(t *testing.T) { // Any single drifted field must trigger a restart. Pins that the - // comparison covers all four webrtc fields (a future refactor that + // comparison covers all five webrtc fields (a future refactor that // drops one would silently let that field drift forever). 
base := gateway.GatewayYAMLWebRTC{ Enabled: true, SFUPort: 30000, @@ -69,6 +69,7 @@ func TestGatewayWebRTCInSync_eachFieldDriftDetected(t *testing.T) { {"sfu port changed", func(w *gateway.GatewayYAMLWebRTC) { w.SFUPort = 30001 }}, {"turn domain changed", func(w *gateway.GatewayYAMLWebRTC) { w.TURNDomain = "turn.other" }}, {"turn secret rotated", func(w *gateway.GatewayYAMLWebRTC) { w.TURNSecret = "rotated" }}, + {"stealth domain changed", func(w *gateway.GatewayYAMLWebRTC) { w.TURNStealthDomain = "cdn-deadbeef0000.orama-devnet.network" }}, } for _, tc := range mutations { t.Run(tc.name, func(t *testing.T) { @@ -190,3 +191,25 @@ func TestReconcileGateway_missingConfigReturnsErrorNotRestart(t *testing.T) { t.Error("missing config must return an error (don't blind-restart a healthy gateway)") } } + +func TestGatewayWebRTCInSync_stealthEnableDetectedAsDrift(t *testing.T) { + // feat-124: enabling stealth must drift an otherwise-matching gateway so + // the reconciler rewrites its yaml with turn_stealth_domain and restarts + // it — that's how turn.credentials starts advertising turns::443. + onDisk := gateway.GatewayYAMLWebRTC{ + Enabled: true, SFUPort: 30000, + TURNDomain: "turn.ns-anchat-test.orama-devnet.network", TURNSecret: "the-secret", + } + desired := desiredEnabled() + desired.TURNStealthDomain = "cdn-abc123def456.orama-devnet.network" + if gatewayWebRTCInSync(onDisk, desired) { + t.Error("stealth enable not detected as drift — gateway would never advertise the stealth URI") + } + + // And once the yaml carries it, the same desired config is in-sync (no + // restart loop). 
+ onDisk.TURNStealthDomain = desired.TURNStealthDomain + if !gatewayWebRTCInSync(onDisk, desired) { + t.Error("matching stealth domain reported as drift — restart loop") + } +} diff --git a/core/pkg/namespace/restore_webrtc_test.go b/core/pkg/namespace/restore_webrtc_test.go index 43e5238..9ab0b8c 100644 --- a/core/pkg/namespace/restore_webrtc_test.go +++ b/core/pkg/namespace/restore_webrtc_test.go @@ -11,11 +11,11 @@ import "testing" // port is per-node (0 on a gateway-only node). Pins both the drift // fallback and the non-SFU-gateway case. -// dbFetch signature: () -> (turnSecret, turnDomain string, sfuPort int). -func dbNone() (string, string, int) { return "", "", 0 } +// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int). +func dbNone() (string, string, string, int) { return "", "", "", 0 } -func dbFull(secret, domain string, sfuPort int) func() (string, string, int) { - return func() (string, string, int) { return secret, domain, sfuPort } +func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int) { + return func() (string, string, string, int) { return secret, domain, "", sfuPort } } func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) { @@ -23,8 +23,8 @@ func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) { // (the lazy dbFetch must not be called — saves a query on the hot // restart path). 
dbCalled := false - got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", - func() (string, string, int) { dbCalled = true; return dbNone() }) + got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", + func() (string, string, string, int) { dbCalled = true; return dbNone() }) if dbCalled { t.Error("DB fetch was called even though the state file had the TURN secret (should short-circuit)") @@ -41,7 +41,7 @@ func TestChooseRestoreWebRTC_staleStateFallsBackToDB(t *testing.T) { // The bug-25 drift case: state file has NO webrtc (stale — written // before enable), DB says enabled WITH an SFU port on this node. MUST // fall back to the DB and re-materialize the full block. - got := chooseRestoreWebRTC(false, 0, "", "", + got := chooseRestoreWebRTC(false, 0, "", "", "", dbFull("db-secret", "turn.ns-anchat-test.dbrs.space", 7801)) if !got.enabled { @@ -65,7 +65,7 @@ func TestChooseRestoreWebRTC_nonSFUGatewayGetsTURNOnly(t *testing.T) { // secret (so /v1/webrtc/turn/credentials registers + works) while // sfuPort stays 0 (signal/rooms don't register). This is exactly node // 57's situation — pre-fix it resolved to disabled and 404'd. - got := chooseRestoreWebRTC(false, 0, "", "", + got := chooseRestoreWebRTC(false, 0, "", "", "", dbFull("db-secret", "turn.ns-anchat-test.dbrs.space", 0)) // sfuPort 0 = no local SFU if !got.enabled { @@ -84,8 +84,8 @@ func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) { // false / port 0. Must use the state TURN secret with sfuPort=0 and // NOT consult the DB (TURN secret present = complete enough). 
dbCalled := false - got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", - func() (string, string, int) { dbCalled = true; return dbNone() }) + got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "", + func() (string, string, string, int) { dbCalled = true; return dbNone() }) if dbCalled { t.Error("DB fetch called even though state file had the TURN secret") @@ -98,7 +98,7 @@ func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) { func TestChooseRestoreWebRTC_bothEmptyDisabled(t *testing.T) { // Namespace genuinely without WebRTC: state empty, DB returns nothing. // Must return disabled so we don't register broken webrtc routes. - got := chooseRestoreWebRTC(false, 0, "", "", dbNone) + got := chooseRestoreWebRTC(false, 0, "", "", "", dbNone) if got.enabled { t.Errorf("want disabled when neither source has WebRTC; got %+v", got) } @@ -109,8 +109,8 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) { // provisioned / shouldn't happen). The TURN secret is the // enablement marker; without it we treat it as not-configured-for- // TURN, but an SFU port alone still enables SFU routes. - got := chooseRestoreWebRTC(false, 0, "", "", - func() (string, string, int) { return "", "turn.db", 9000 }) + got := chooseRestoreWebRTC(false, 0, "", "", "", + func() (string, string, string, int) { return "", "turn.db", "", 9000 }) // dbFetch only runs when state secret is empty; here it returns no // secret, so the `if dbSecret != ""` guard means NOTHING is taken // from the DB → disabled. 
(An SFU-only-no-TURN namespace is not a @@ -119,3 +119,39 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) { t.Errorf("DB returned no TURN secret: want disabled; got %+v", got) } } + +// --- feat-124 stealth domain restore precedence --- + +func TestChooseRestoreWebRTC_stealthFromStateFile(t *testing.T) { + // Stealth toggles rewrite cluster state, so a fresh state file carries + // the stealth domain and must win without a DB call. + got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space", + func() (string, string, string, int) { + t.Error("DB fetch called even though state file was complete") + return dbNone() + }) + if got.stealthDomain != "cdn-abc123def456.dbrs.space" { + t.Errorf("stealthDomain = %q; want state-file value", got.stealthDomain) + } +} + +func TestChooseRestoreWebRTC_stealthFromDBOnStaleState(t *testing.T) { + // Stale state (no TURN secret) + DB has stealth enabled → stealth domain + // re-materializes from the DB alongside the rest of the WebRTC block. + got := chooseRestoreWebRTC(false, 0, "", "", "", + func() (string, string, string, int) { + return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801 + }) + if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" { + t.Errorf("want stealth domain from DB on stale state; got %+v", got) + } +} + +func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) { + // Stealth disabled everywhere → empty stealthDomain (gateway advertises + // the baseline 3-rung ladder only). 
+ got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbNone) + if got.stealthDomain != "" { + t.Errorf("stealthDomain = %q; want empty when stealth is disabled", got.stealthDomain) + } +} diff --git a/core/pkg/namespace/systemd_spawner.go b/core/pkg/namespace/systemd_spawner.go index c96d7f2..d4e0399 100644 --- a/core/pkg/namespace/systemd_spawner.go +++ b/core/pkg/namespace/systemd_spawner.go @@ -234,10 +234,11 @@ func (s *SystemdSpawner) SpawnGateway(ctx context.Context, namespace, nodeID str // namespace gateways even though the host gateway had the key. SecretsEncryptionKey: cfg.SecretsEncryptionKey, WebRTC: gateway.GatewayYAMLWebRTC{ - Enabled: cfg.WebRTCEnabled, - SFUPort: cfg.SFUPort, - TURNDomain: cfg.TURNDomain, - TURNSecret: cfg.TURNSecret, + Enabled: cfg.WebRTCEnabled, + SFUPort: cfg.SFUPort, + TURNDomain: cfg.TURNDomain, + TURNSecret: cfg.TURNSecret, + TURNStealthDomain: cfg.TURNStealthDomain, }, } @@ -343,7 +344,8 @@ func gatewayWebRTCInSync(onDisk gateway.GatewayYAMLWebRTC, cfg gateway.InstanceC return onDisk.Enabled == cfg.WebRTCEnabled && onDisk.SFUPort == cfg.SFUPort && onDisk.TURNSecret == cfg.TURNSecret && - onDisk.TURNDomain == cfg.TURNDomain + onDisk.TURNDomain == cfg.TURNDomain && + onDisk.TURNStealthDomain == cfg.TURNStealthDomain } // gatewayConfigInSync reports whether the full reconcile-relevant config on @@ -516,6 +518,68 @@ type TURNInstanceConfig struct { RelayPortStart int // Start of relay port range RelayPortEnd int // End of relay port range TURNDomain string // TURN domain for Let's Encrypt cert (e.g., "turn.ns-myapp.orama-devnet.network") + // StealthDomain is the neutral stealth TURNS host (feat-124). When set, + // the TURN server carries a second Let's Encrypt cert for this name and + // serves it to TLS clients whose SNI matches — the path the SNI router + // forwards from :443. Stealth NEVER falls back to a self-signed cert: a + // cert clients reject is indistinguishable from being blocked. 
+ StealthDomain string +} + +// acmeInternalEndpoint is the gateway's internal ACME endpoint that the +// Caddyfile TURN-cert blocks point the orama DNS provider at. +const acmeInternalEndpoint = "http://localhost:6001/v1/internal/acme" + +// turnCertProvisionTimeout bounds how long a TURN spawn waits for Caddy to +// provision a Let's Encrypt cert before falling back (primary domain) or +// failing (stealth domain). +const turnCertProvisionTimeout = 2 * time.Minute + +// resolveTURNSCert resolves the TURNS cert/key pair for a domain. +// +// Let's Encrypt via Caddy is tried FIRST whenever a domain is set — the call +// is idempotent and instant when the cert is already in Caddy's storage. This +// ordering also self-heals nodes stuck on the self-signed fallback from an +// earlier failed provisioning (live devnet finding, feat-124): the old code +// never retried Caddy once a self-signed pair existed on disk, so strict TLS +// clients kept failing turns: validation forever. +// +// allowSelfSigned controls the fallback: the primary TURN domain may fall +// back to (or reuse) a self-signed pair at /turn-{cert,key}.pem so +// baseline TURN stays up, while the stealth domain must hard-fail instead. 
+func (s *SystemdSpawner) resolveTURNSCert(namespace, domain, publicIP, configDir string, allowSelfSigned bool) (string, string, error) { + if domain != "" { + caddyCert, caddyKey, err := provisionTURNCertViaCaddy(domain, acmeInternalEndpoint, turnCertProvisionTimeout) + if err == nil { + s.logger.Info("Using Let's Encrypt cert from Caddy for TURNS", + zap.String("namespace", namespace), + zap.String("domain", domain), + zap.String("cert_path", caddyCert)) + return caddyCert, caddyKey, nil + } + if !allowSelfSigned { + return "", "", fmt.Errorf("failed to provision Let's Encrypt cert for stealth TURNS domain %s (no self-signed fallback — clients must be able to validate it): %w", domain, err) + } + s.logger.Warn("Let's Encrypt cert provisioning failed, falling back to self-signed", + zap.String("namespace", namespace), + zap.String("domain", domain), + zap.Error(err)) + } + if !allowSelfSigned { + return "", "", fmt.Errorf("no domain configured for TURNS cert in namespace %s", namespace) + } + + certPath := filepath.Join(configDir, "turn-cert.pem") + keyPath := filepath.Join(configDir, "turn-key.pem") + if _, err := os.Stat(certPath); os.IsNotExist(err) { + if err := turn.GenerateSelfSignedCert(certPath, keyPath, publicIP); err != nil { + return "", "", fmt.Errorf("failed to generate TURNS self-signed cert for namespace %s: %w", namespace, err) + } + s.logger.Info("Generated TURNS self-signed certificate", + zap.String("namespace", namespace), + zap.String("cert_path", certPath)) + } + return certPath, keyPath, nil } // SpawnTURN starts a TURN instance using systemd @@ -534,43 +598,48 @@ func (s *SystemdSpawner) SpawnTURN(ctx context.Context, namespace, nodeID string configPath := filepath.Join(configDir, fmt.Sprintf("turn-%s.yaml", nodeID)) - // Provision TLS cert for TURNS — try Let's Encrypt via Caddy first, fall back to self-signed - certPath := filepath.Join(configDir, "turn-cert.pem") - keyPath := filepath.Join(configDir, "turn-key.pem") + // Provision TLS 
cert for TURNS — Let's Encrypt via Caddy first (idempotent, + // also upgrades nodes stuck on the self-signed fallback), self-signed as + // the primary-domain fallback only. + var certPath, keyPath string if cfg.TURNSListenAddr != "" { - if _, err := os.Stat(certPath); os.IsNotExist(err) { - // Try Let's Encrypt via Caddy first - if cfg.TURNDomain != "" { - acmeEndpoint := "http://localhost:6001/v1/internal/acme" - caddyCert, caddyKey, provErr := provisionTURNCertViaCaddy(cfg.TURNDomain, acmeEndpoint, 2*time.Minute) - if provErr == nil { - certPath = caddyCert - keyPath = caddyKey - s.logger.Info("Using Let's Encrypt cert from Caddy for TURNS", - zap.String("namespace", namespace), - zap.String("domain", cfg.TURNDomain), - zap.String("cert_path", certPath)) - } else { - s.logger.Warn("Let's Encrypt cert provisioning failed, falling back to self-signed", - zap.String("namespace", namespace), - zap.String("domain", cfg.TURNDomain), - zap.Error(provErr)) - } - } - // Fallback: generate self-signed cert if no cert is available yet - if _, statErr := os.Stat(certPath); os.IsNotExist(statErr) { - if err := turn.GenerateSelfSignedCert(certPath, keyPath, cfg.PublicIP); err != nil { - s.logger.Warn("Failed to generate TURNS self-signed cert, TURNS will be disabled", - zap.String("namespace", namespace), - zap.Error(err)) - cfg.TURNSListenAddr = "" // Disable TURNS if cert generation fails - } else { - s.logger.Info("Generated TURNS self-signed certificate", - zap.String("namespace", namespace), - zap.String("cert_path", certPath)) - } + var certErr error + certPath, keyPath, certErr = s.resolveTURNSCert(namespace, cfg.TURNDomain, cfg.PublicIP, configDir, true) + if certErr != nil { + s.logger.Warn("Failed to resolve TURNS cert, TURNS will be disabled", + zap.String("namespace", namespace), + zap.Error(certErr)) + cfg.TURNSListenAddr = "" // Disable TURNS if no cert is available + } + } + + // Stealth TURNS cert (feat-124): requires a working TURNS listener and a + // 
CA-valid cert — hard error, never a silent downgrade, because the + // operator explicitly enabled stealth and a half-working stealth endpoint + // is invisible until a censored-region user fails to connect. + var stealthCertPath, stealthKeyPath string + if cfg.StealthDomain != "" { + // Security: the stealth domain arrives over the spawn protocol (mesh + // peers gated only by the static internal-auth header). Before it + // reaches the Caddyfile/ACME sink, pin it to the deterministic + // derivation so a forged value can't drive cert issuance for an + // attacker-chosen name. cfg.Realm is the base domain on every TURN + // spawn site. (provisionTURNCertViaCaddy adds a DNS-name allowlist as + // defense-in-depth.) + if cfg.Realm != "" { + want := turn.StealthHostForNamespace(cfg.Namespace, cfg.Realm) + if cfg.StealthDomain != want { + return fmt.Errorf("stealth domain %q does not match the derived host %q for namespace %s — refusing to provision", cfg.StealthDomain, want, cfg.Namespace) } } + if cfg.TURNSListenAddr == "" { + return fmt.Errorf("stealth TURNS for namespace %s requires an active TURNS listener (no TLS cert/listener available)", namespace) + } + var stealthErr error + stealthCertPath, stealthKeyPath, stealthErr = s.resolveTURNSCert(namespace, cfg.StealthDomain, cfg.PublicIP, configDir, false) + if stealthErr != nil { + return fmt.Errorf("failed to provision stealth TURNS cert for namespace %s: %w", namespace, stealthErr) + } } // Build TURN YAML config @@ -588,6 +657,11 @@ func (s *SystemdSpawner) SpawnTURN(ctx context.Context, namespace, nodeID string turnConfig.TLSCertPath = certPath turnConfig.TLSKeyPath = keyPath } + if stealthCertPath != "" { + turnConfig.StealthDomain = cfg.StealthDomain + turnConfig.TLSStealthCertPath = stealthCertPath + turnConfig.TLSStealthKeyPath = stealthKeyPath + } configBytes, err := yaml.Marshal(turnConfig) if err != nil { diff --git a/core/pkg/namespace/turn_cert.go b/core/pkg/namespace/turn_cert.go index 
00ac1ed..7b940e7 100644 --- a/core/pkg/namespace/turn_cert.go +++ b/core/pkg/namespace/turn_cert.go @@ -5,10 +5,20 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "strings" "time" ) +// dnsNamePattern matches a conservative lowercase DNS hostname. It exists to +// keep an operator/spawn-supplied domain from breaking out of the Caddyfile +// block it is interpolated into (a value containing '{', '}', or a newline +// could otherwise inject arbitrary Caddy directives) and to refuse cert +// provisioning for non-hostname junk. Security: defense-in-depth at the +// Caddyfile sink; the caller also pins the stealth domain to its deterministic +// derivation (systemd_spawner.go SpawnTURN). +var dnsNamePattern = regexp.MustCompile(`^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)+$`) + const ( caddyfilePath = "/etc/caddy/Caddyfile" @@ -25,6 +35,12 @@ const ( // If Caddy is not available or cert provisioning times out, returns an error // so the caller can fall back to a self-signed cert. func provisionTURNCertViaCaddy(domain, acmeEndpoint string, timeout time.Duration) (certPath, keyPath string, err error) { + // Refuse anything that isn't a clean DNS name before it reaches the + // Caddyfile write — blocks Caddyfile-injection via crafted domains. + if !dnsNamePattern.MatchString(domain) { + return "", "", fmt.Errorf("refusing to provision TURNS cert for non-DNS-name domain %q", domain) + } + // Check if cert already exists from a previous provisioning certPath, keyPath = caddyCertPaths(domain) if _, err := os.Stat(certPath); err == nil { diff --git a/core/pkg/namespace/turn_stealth_cert_test.go b/core/pkg/namespace/turn_stealth_cert_test.go new file mode 100644 index 0000000..cd6b875 --- /dev/null +++ b/core/pkg/namespace/turn_stealth_cert_test.go @@ -0,0 +1,108 @@ +package namespace + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" + + "go.uber.org/zap" +) + +// feat-124 — resolveTURNSCert semantics. 
+// +// On machines without a Caddyfile (tests, dev laptops) the Let's Encrypt +// branch fails fast with "failed to read Caddyfile", exercising exactly the +// fallback decision this function owns: primary domains degrade to a +// self-signed pair, the stealth domain must hard-fail instead. + +func testSpawner(t *testing.T) *SystemdSpawner { + t.Helper() + return &SystemdSpawner{logger: zap.NewNop()} +} + +func TestResolveTURNSCert_primaryFallsBackToSelfSigned(t *testing.T) { + s := testSpawner(t) + dir := t.TempDir() + + certPath, keyPath, err := s.resolveTURNSCert("ns-test", "turn.ns-test.example.com", "203.0.113.7", dir, true) + if err != nil { + t.Fatalf("expected self-signed fallback, got error: %v", err) + } + if certPath != filepath.Join(dir, "turn-cert.pem") || keyPath != filepath.Join(dir, "turn-key.pem") { + t.Errorf("unexpected fallback paths: %s / %s", certPath, keyPath) + } + if _, statErr := os.Stat(certPath); statErr != nil { + t.Errorf("self-signed cert not written: %v", statErr) + } +} + +func TestResolveTURNSCert_existingSelfSignedReused(t *testing.T) { + s := testSpawner(t) + dir := t.TempDir() + + first, _, err := s.resolveTURNSCert("ns-test", "", "203.0.113.7", dir, true) + if err != nil { + t.Fatalf("first resolve: %v", err) + } + info1, err := os.Stat(first) + if err != nil { + t.Fatalf("stat first cert: %v", err) + } + + second, _, err := s.resolveTURNSCert("ns-test", "", "203.0.113.7", dir, true) + if err != nil { + t.Fatalf("second resolve: %v", err) + } + info2, err := os.Stat(second) + if err != nil { + t.Fatalf("stat second cert: %v", err) + } + if first != second || info1.ModTime() != info2.ModTime() { + t.Error("existing self-signed pair was regenerated instead of reused") + } +} + +func TestResolveTURNSCert_stealthNeverFallsBackToSelfSigned(t *testing.T) { + s := testSpawner(t) + dir := t.TempDir() + + _, _, err := s.resolveTURNSCert("ns-test", "cdn-abc123def456.example.com", "203.0.113.7", dir, false) + if err == nil { + 
t.Fatal("stealth cert resolution must hard-fail without Let's Encrypt — a self-signed stealth cert is indistinguishable from being blocked") + } + if !strings.Contains(err.Error(), "cdn-abc123def456.example.com") { + t.Errorf("error must name the stealth domain for the operator; got: %v", err) + } + if _, statErr := os.Stat(filepath.Join(dir, "turn-cert.pem")); !os.IsNotExist(statErr) { + t.Error("stealth failure must not write a self-signed pair") + } +} + +func TestResolveTURNSCert_noDomainNoFallbackErrors(t *testing.T) { + s := testSpawner(t) + _, _, err := s.resolveTURNSCert("ns-test", "", "203.0.113.7", t.TempDir(), false) + if err == nil { + t.Fatal("empty domain with self-signed disallowed must error") + } +} + +// Security (feat-124): the Caddyfile sink must refuse any domain that isn't a +// clean DNS name, so a crafted value can't break out of the generated block +// and inject Caddy directives. +func TestProvisionTURNCertViaCaddy_rejectsNonDNSName(t *testing.T) { + bad := []string{ + "example.com {\n reverse_proxy evil:1234\n}\n#", + "has space.com", + "UPPER.example.com", + "nodots", + "trailing-.example.com", + "", + } + for _, d := range bad { + if _, _, err := provisionTURNCertViaCaddy(d, "http://localhost:6001/v1/internal/acme", time.Second); err == nil { + t.Errorf("provisionTURNCertViaCaddy(%q) accepted a non-DNS-name domain", d) + } + } +} diff --git a/core/pkg/namespace/types.go b/core/pkg/namespace/types.go index 2ee5550..216408f 100644 --- a/core/pkg/namespace/types.go +++ b/core/pkg/namespace/types.go @@ -94,8 +94,8 @@ const ( const ( // SFU media port range: 20000-29999 // Each namespace gets a 500-port sub-range for RTP media - SFUMediaPortRangeStart = 20000 - SFUMediaPortRangeEnd = 29999 + SFUMediaPortRangeStart = 20000 + SFUMediaPortRangeEnd = 29999 SFUMediaPortsPerNamespace = 500 // SFU signaling ports: 30000-30099 @@ -105,8 +105,8 @@ const ( // TURN relay port range: 49152-65535 // Each namespace gets an 800-port sub-range for TURN 
relay - TURNRelayPortRangeStart = 49152 - TURNRelayPortRangeEnd = 65535 + TURNRelayPortRangeStart = 49152 + TURNRelayPortRangeEnd = 65535 TURNRelayPortsPerNamespace = 800 // TURN listen ports (standard) @@ -152,38 +152,38 @@ type NamespaceCluster struct { // ClusterNode represents a node participating in a namespace cluster type ClusterNode struct { - ID string `json:"id" db:"id"` - NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` - NodeID string `json:"node_id" db:"node_id"` - Role NodeRole `json:"role" db:"role"` - RQLiteHTTPPort int `json:"rqlite_http_port,omitempty" db:"rqlite_http_port"` - RQLiteRaftPort int `json:"rqlite_raft_port,omitempty" db:"rqlite_raft_port"` - OlricHTTPPort int `json:"olric_http_port,omitempty" db:"olric_http_port"` - OlricMemberlistPort int `json:"olric_memberlist_port,omitempty" db:"olric_memberlist_port"` - GatewayHTTPPort int `json:"gateway_http_port,omitempty" db:"gateway_http_port"` - Status NodeStatus `json:"status" db:"status"` - ProcessPID int `json:"process_pid,omitempty" db:"process_pid"` - LastHeartbeat *time.Time `json:"last_heartbeat,omitempty" db:"last_heartbeat"` - ErrorMessage string `json:"error_message,omitempty" db:"error_message"` - RQLiteJoinAddress string `json:"rqlite_join_address,omitempty" db:"rqlite_join_address"` - OlricPeers string `json:"olric_peers,omitempty" db:"olric_peers"` // JSON array - CreatedAt time.Time `json:"created_at" db:"created_at"` - UpdatedAt time.Time `json:"updated_at" db:"updated_at"` + ID string `json:"id" db:"id"` + NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` + NodeID string `json:"node_id" db:"node_id"` + Role NodeRole `json:"role" db:"role"` + RQLiteHTTPPort int `json:"rqlite_http_port,omitempty" db:"rqlite_http_port"` + RQLiteRaftPort int `json:"rqlite_raft_port,omitempty" db:"rqlite_raft_port"` + OlricHTTPPort int `json:"olric_http_port,omitempty" db:"olric_http_port"` + OlricMemberlistPort int 
`json:"olric_memberlist_port,omitempty" db:"olric_memberlist_port"` + GatewayHTTPPort int `json:"gateway_http_port,omitempty" db:"gateway_http_port"` + Status NodeStatus `json:"status" db:"status"` + ProcessPID int `json:"process_pid,omitempty" db:"process_pid"` + LastHeartbeat *time.Time `json:"last_heartbeat,omitempty" db:"last_heartbeat"` + ErrorMessage string `json:"error_message,omitempty" db:"error_message"` + RQLiteJoinAddress string `json:"rqlite_join_address,omitempty" db:"rqlite_join_address"` + OlricPeers string `json:"olric_peers,omitempty" db:"olric_peers"` // JSON array + CreatedAt time.Time `json:"created_at" db:"created_at"` + UpdatedAt time.Time `json:"updated_at" db:"updated_at"` } // PortBlock represents an allocated block of ports for a namespace on a node type PortBlock struct { - ID string `json:"id" db:"id"` - NodeID string `json:"node_id" db:"node_id"` - NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` - PortStart int `json:"port_start" db:"port_start"` - PortEnd int `json:"port_end" db:"port_end"` - RQLiteHTTPPort int `json:"rqlite_http_port" db:"rqlite_http_port"` - RQLiteRaftPort int `json:"rqlite_raft_port" db:"rqlite_raft_port"` - OlricHTTPPort int `json:"olric_http_port" db:"olric_http_port"` - OlricMemberlistPort int `json:"olric_memberlist_port" db:"olric_memberlist_port"` - GatewayHTTPPort int `json:"gateway_http_port" db:"gateway_http_port"` - AllocatedAt time.Time `json:"allocated_at" db:"allocated_at"` + ID string `json:"id" db:"id"` + NodeID string `json:"node_id" db:"node_id"` + NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` + PortStart int `json:"port_start" db:"port_start"` + PortEnd int `json:"port_end" db:"port_end"` + RQLiteHTTPPort int `json:"rqlite_http_port" db:"rqlite_http_port"` + RQLiteRaftPort int `json:"rqlite_raft_port" db:"rqlite_raft_port"` + OlricHTTPPort int `json:"olric_http_port" db:"olric_http_port"` + OlricMemberlistPort int 
`json:"olric_memberlist_port" db:"olric_memberlist_port"` + GatewayHTTPPort int `json:"gateway_http_port" db:"gateway_http_port"` + AllocatedAt time.Time `json:"allocated_at" db:"allocated_at"` } // ClusterEvent represents an audit event for cluster lifecycle @@ -238,33 +238,39 @@ func (e *ClusterError) Unwrap() error { } var ( - ErrNoPortsAvailable = &ClusterError{Message: "no ports available on node"} - ErrNodeAtCapacity = &ClusterError{Message: "node has reached maximum namespace instances"} - ErrInsufficientNodes = &ClusterError{Message: "insufficient nodes available for cluster"} - ErrClusterNotFound = &ClusterError{Message: "namespace cluster not found"} - ErrClusterAlreadyExists = &ClusterError{Message: "namespace cluster already exists"} - ErrProvisioningFailed = &ClusterError{Message: "cluster provisioning failed"} - ErrNamespaceNotFound = &ClusterError{Message: "namespace not found"} - ErrInvalidClusterStatus = &ClusterError{Message: "invalid cluster status for operation"} - ErrRecoveryInProgress = &ClusterError{Message: "recovery already in progress for this cluster"} - ErrWebRTCAlreadyEnabled = &ClusterError{Message: "WebRTC is already enabled for this namespace"} - ErrWebRTCNotEnabled = &ClusterError{Message: "WebRTC is not enabled for this namespace"} - ErrNoWebRTCPortsAvailable = &ClusterError{Message: "no WebRTC ports available on node"} + ErrNoPortsAvailable = &ClusterError{Message: "no ports available on node"} + ErrNodeAtCapacity = &ClusterError{Message: "node has reached maximum namespace instances"} + ErrInsufficientNodes = &ClusterError{Message: "insufficient nodes available for cluster"} + ErrClusterNotFound = &ClusterError{Message: "namespace cluster not found"} + ErrClusterAlreadyExists = &ClusterError{Message: "namespace cluster already exists"} + ErrProvisioningFailed = &ClusterError{Message: "cluster provisioning failed"} + ErrNamespaceNotFound = &ClusterError{Message: "namespace not found"} + ErrInvalidClusterStatus = 
&ClusterError{Message: "invalid cluster status for operation"} + ErrRecoveryInProgress = &ClusterError{Message: "recovery already in progress for this cluster"} + ErrWebRTCAlreadyEnabled = &ClusterError{Message: "WebRTC is already enabled for this namespace"} + ErrWebRTCNotEnabled = &ClusterError{Message: "WebRTC is not enabled for this namespace"} + ErrWebRTCStealthAlreadyEnabled = &ClusterError{Message: "WebRTC stealth is already enabled for this namespace"} + ErrWebRTCStealthNotEnabled = &ClusterError{Message: "WebRTC stealth is not enabled for this namespace"} + ErrNoWebRTCPortsAvailable = &ClusterError{Message: "no WebRTC ports available on node"} ) // WebRTCConfig represents the per-namespace WebRTC configuration stored in the database type WebRTCConfig struct { - ID string `json:"id" db:"id"` - NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` - NamespaceName string `json:"namespace_name" db:"namespace_name"` - Enabled bool `json:"enabled" db:"enabled"` - TURNSharedSecret string `json:"-" db:"turn_shared_secret"` // Never serialize secret to JSON - TURNCredentialTTL int `json:"turn_credential_ttl" db:"turn_credential_ttl"` - SFUNodeCount int `json:"sfu_node_count" db:"sfu_node_count"` - TURNNodeCount int `json:"turn_node_count" db:"turn_node_count"` - EnabledBy string `json:"enabled_by" db:"enabled_by"` - EnabledAt time.Time `json:"enabled_at" db:"enabled_at"` - DisabledAt *time.Time `json:"disabled_at,omitempty" db:"disabled_at"` + ID string `json:"id" db:"id"` + NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` + NamespaceName string `json:"namespace_name" db:"namespace_name"` + Enabled bool `json:"enabled" db:"enabled"` + TURNSharedSecret string `json:"-" db:"turn_shared_secret"` // Never serialize secret to JSON + TURNCredentialTTL int `json:"turn_credential_ttl" db:"turn_credential_ttl"` + SFUNodeCount int `json:"sfu_node_count" db:"sfu_node_count"` + TURNNodeCount int 
`json:"turn_node_count" db:"turn_node_count"` + // StealthEnabled gates the censorship-resistant TURNS:443 path (feat-124): + // stealth cert on the TURN servers, SNI route on :443, and the + // `turns:<stealth-domain>:443` rung in the turn.credentials URI ladder. + StealthEnabled bool `json:"stealth_enabled" db:"stealth_enabled"` + EnabledBy string `json:"enabled_by" db:"enabled_by"` + EnabledAt time.Time `json:"enabled_at" db:"enabled_at"` + DisabledAt *time.Time `json:"disabled_at,omitempty" db:"disabled_at"` } // WebRTCRoom represents an active WebRTC room tracked in the database @@ -284,15 +290,15 @@ type WebRTCRoom struct { // WebRTCPortBlock represents allocated WebRTC ports for a namespace on a node type WebRTCPortBlock struct { - ID string `json:"id" db:"id"` - NodeID string `json:"node_id" db:"node_id"` - NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` - ServiceType string `json:"service_type" db:"service_type"` // "sfu" or "turn" + ID string `json:"id" db:"id"` + NodeID string `json:"node_id" db:"node_id"` + NamespaceClusterID string `json:"namespace_cluster_id" db:"namespace_cluster_id"` + ServiceType string `json:"service_type" db:"service_type"` // "sfu" or "turn" // SFU ports - SFUSignalingPort int `json:"sfu_signaling_port,omitempty" db:"sfu_signaling_port"` - SFUMediaPortStart int `json:"sfu_media_port_start,omitempty" db:"sfu_media_port_start"` - SFUMediaPortEnd int `json:"sfu_media_port_end,omitempty" db:"sfu_media_port_end"` + SFUSignalingPort int `json:"sfu_signaling_port,omitempty" db:"sfu_signaling_port"` + SFUMediaPortStart int `json:"sfu_media_port_start,omitempty" db:"sfu_media_port_start"` + SFUMediaPortEnd int `json:"sfu_media_port_end,omitempty" db:"sfu_media_port_end"` // TURN ports TURNListenPort int `json:"turn_listen_port,omitempty" db:"turn_listen_port"` diff --git a/core/pkg/serverless/engine.go b/core/pkg/serverless/engine.go index 85a5635..db21803 100644 --- a/core/pkg/serverless/engine.go +++ 
b/core/pkg/serverless/engine.go @@ -828,6 +828,7 @@ func (e *Engine) registerHostModule(ctx context.Context) error { NewFunctionBuilder().WithFunc(e.hWSBroadcast).Export("ws_broadcast"). NewFunctionBuilder().WithFunc(e.hEphemeralStateSet).Export("ephemeral_state_set"). NewFunctionBuilder().WithFunc(e.hEphemeralStateClear).Export("ephemeral_state_clear"). + NewFunctionBuilder().WithFunc(e.hEphemeralStateList).Export("ephemeral_state_list"). NewFunctionBuilder().WithFunc(e.hFunctionInvoke).Export("function_invoke"). NewFunctionBuilder().WithFunc(e.hFunctionInvokeAsync).Export("function_invoke_async"). NewFunctionBuilder().WithFunc(e.hLogInfo).Export("log_info"). @@ -1463,6 +1464,33 @@ func (e *Engine) hEphemeralStateClear(ctx context.Context, mod api.Module, return 1 } +// hEphemeralStateList is the WASM-callable wrapper for EphemeralStateList — +// the bugboard #710 reconnect catch-up read. +// +// ABI: ephemeral_state_list(topicPtr, topicLen uint32) -> uint64 packed +// (ptr<<32 | len) pointing to a JSON envelope in guest memory: +// +// {"entries":[{"key":..,"client_id":..,"payload":<base64>,"expires_in_ms":..}, …]} +// +// Returns 0 on failure (empty topic, no invocation context, ephemeral state +// unavailable, or a guest-memory error). Unlike set/clear, no WS client is +// required — the read is namespace-scoped via the invocation context. +func (e *Engine) hEphemeralStateList(ctx context.Context, mod api.Module, + topicPtr, topicLen uint32) uint64 { + topic, ok := e.executor.ReadFromGuest(mod, topicPtr, topicLen) + if !ok { + return 0 + } + out, err := e.hostServices.EphemeralStateList(ctx, string(topic)) + if err != nil { + e.logger.Warn("host function ephemeral_state_list failed", + zap.String("topic", string(topic)), + zap.Error(err)) + return 0 + } + return e.executor.WriteToGuest(ctx, mod, out) +} + // hPushSend is the WASM-callable wrapper for PushSend. 
// Inputs: // diff --git a/core/pkg/serverless/ephemeral_state.go b/core/pkg/serverless/ephemeral_state.go index 590e66c..a854f3d 100644 --- a/core/pkg/serverless/ephemeral_state.go +++ b/core/pkg/serverless/ephemeral_state.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "sort" "sync" "time" ) @@ -47,26 +48,29 @@ const ( ephemeralSweepInterval = 10 * time.Second ) -// EphemeralEventKind discriminates the synthetic events published on a topic. -type EphemeralEventKind string - +// Synthetic-event discriminator values carried in the `_orama` field. The +// `_orama` control-frame namespace is the contract agreed with app teams on +// bugboard #710 (#458/#505/#849/#901) — the same dispatch pattern clients +// already use for the auth.refresh control frame from #321. const ( - EphemeralEventSet EphemeralEventKind = "set" - EphemeralEventClear EphemeralEventKind = "clear" + EphemeralEventSet = "ephemeral.set" + EphemeralEventClear = "ephemeral.clear" ) // EphemeralEvent is the wire shape published on the topic when ephemeral state -// is set, cleared, or auto-cleared on disconnect/expiry. Subscribers key off -// Kind + Key to update their local view. Payload is only populated for "set". +// is set, cleared, or auto-cleared on disconnect/expiry. Subscribers dispatch +// on the `_orama` discriminator + Key to update their local view. Payload is +// only populated for "ephemeral.set". 
type EphemeralEvent struct { - Type string `json:"__ephemeral"` // always "state" - Kind EphemeralEventKind `json:"kind"` // set | clear - Key string `json:"key"` // app-chosen key - ClientID string `json:"client_id"` // owning WS client + Type string `json:"_orama"` // "ephemeral.set" | "ephemeral.clear" + Topic string `json:"topic"` // the topic the state lives on (self-describing for sub-routers) + Key string `json:"key"` // app-chosen key + ClientID string `json:"client_id"` // owning WS client // Payload is the opaque app-chosen blob (may be JSON, protobuf, or - // arbitrary bytes), present only for "set". encoding/json base64-encodes - // a []byte on the wire, so subscribers base64-decode "payload" to recover - // the original bytes — mirroring how pubsub_publish_batch carries data. + // arbitrary bytes), present only for "ephemeral.set". encoding/json + // base64-encodes a []byte on the wire, so subscribers base64-decode + // "payload" to recover the original bytes — mirroring how + // pubsub_publish_batch carries data. Payload []byte `json:"payload,omitempty"` Reason string `json:"reason,omitempty"` // clear only: explicit|disconnect|expired } @@ -192,8 +196,8 @@ func (s *EphemeralStore) Set(ctx context.Context, namespace, clientID, topic, ke s.mu.Unlock() evt := EphemeralEvent{ - Type: "state", - Kind: EphemeralEventSet, + Type: EphemeralEventSet, + Topic: topic, Key: key, ClientID: clientID, Payload: payloadCopy, @@ -225,14 +229,60 @@ func (s *EphemeralStore) Clear(ctx context.Context, namespace, clientID, topic, s.mu.Unlock() return s.publishEvent(ctx, namespace, topic, EphemeralEvent{ - Type: "state", - Kind: EphemeralEventClear, + Type: EphemeralEventClear, + Topic: topic, Key: key, ClientID: clientID, Reason: "explicit", }) } +// EphemeralListEntry is one live entry returned by List — the reconnect +// catch-up shape for the ephemeral_state_list host fn. ExpiresInMs is relative +// (remaining TTL) so callers don't need a synchronized clock. 
+type EphemeralListEntry struct { + Key string `json:"key"` + ClientID string `json:"client_id"` + Payload []byte `json:"payload,omitempty"` + ExpiresInMs int64 `json:"expires_in_ms"` +} + +// List returns the live (non-expired) entries on a (namespace, topic), sorted +// by key for deterministic output. The reconnect catch-up path (bugboard #710 +// acceptance): a client that just (re)subscribed reads the current state once, +// then tracks the ephemeral.set/ephemeral.clear event stream. Read-only — no +// ownership requirement, no WS client needed. +func (s *EphemeralStore) List(namespace, topic string) []EphemeralListEntry { + now := s.now() + + s.mu.Lock() + entries := make([]EphemeralListEntry, 0) + for sk, entry := range s.values { + if sk.namespace != namespace || sk.topic != topic { + continue + } + if !now.Before(entry.expiresAt) { + // now >= expiresAt: hide it. Intentionally one tick stricter than + // sweepExpired (which removes only when now.After(expiresAt)) — a + // reconnect catch-up must never surface state that is at/past its + // deadline, even if the backstop sweeper hasn't run yet. + continue + } + payloadCopy := make([]byte, len(entry.payload)) + copy(payloadCopy, entry.payload) + entries = append(entries, EphemeralListEntry{ + Key: entry.key, + ClientID: entry.clientID, + Payload: payloadCopy, + ExpiresInMs: entry.expiresAt.Sub(now).Milliseconds(), + }) + } + s.mu.Unlock() + + sort.Slice(entries, func(i, j int) bool { return entries[i].Key < entries[j].Key }) + return entries +} + // ClearClient removes every entry owned by clientID and publishes a clear // event for each (reason "disconnect"). Called from the WS disconnect hook — // the primary, zero-lag cleanup path. Safe to call for an unknown client. 
@@ -261,8 +311,8 @@ func (s *EphemeralStore) clearClientWithReason(ctx context.Context, clientID, re for _, entry := range toClear { _ = s.publishEvent(ctx, entry.namespace, entry.topic, EphemeralEvent{ - Type: "state", - Kind: EphemeralEventClear, + Type: EphemeralEventClear, + Topic: entry.topic, Key: entry.key, ClientID: clientID, Reason: reason, @@ -292,7 +342,7 @@ func (s *EphemeralStore) publishEvent(ctx context.Context, namespace, topic stri return fmt.Errorf("ephemeral state: marshal event: %w", err) } if err := s.publish(ctx, namespace, topic, data); err != nil { - return fmt.Errorf("ephemeral state: publish %s event: %w", evt.Kind, err) + return fmt.Errorf("ephemeral state: publish %s event: %w", evt.Type, err) } return nil } @@ -335,8 +385,8 @@ func (s *EphemeralStore) sweepExpired(ctx context.Context) { for _, entry := range expired { _ = s.publishEvent(ctx, entry.namespace, entry.topic, EphemeralEvent{ - Type: "state", - Kind: EphemeralEventClear, + Type: EphemeralEventClear, + Topic: entry.topic, Key: entry.key, ClientID: entry.clientID, Reason: "expired", diff --git a/core/pkg/serverless/ephemeral_state_test.go b/core/pkg/serverless/ephemeral_state_test.go index ba6bb98..de119a5 100644 --- a/core/pkg/serverless/ephemeral_state_test.go +++ b/core/pkg/serverless/ephemeral_state_test.go @@ -40,12 +40,12 @@ func (c *capturePublisher) snapshot() []capturedEvent { return out } -func (c *capturePublisher) countKind(kind EphemeralEventKind) int { +func (c *capturePublisher) countKind(eventType string) int { c.mu.Lock() defer c.mu.Unlock() n := 0 for _, e := range c.events { - if e.event.Kind == kind { + if e.event.Type == eventType { n++ } } @@ -114,7 +114,7 @@ func TestEphemeralStore_SetThenDisconnect(t *testing.T) { t.Errorf("disconnect clear events = %d, want 2", got) } for _, e := range pub.snapshot() { - if e.event.Kind == EphemeralEventClear && e.event.Reason != "disconnect" { + if e.event.Type == EphemeralEventClear && e.event.Reason != "disconnect" { 
t.Errorf("clear reason = %q, want disconnect", e.event.Reason) } } @@ -149,7 +149,7 @@ func TestEphemeralStore_TTLExpiry(t *testing.T) { // A clear event with reason=expired must have been published. foundExpired := false for _, e := range pub.snapshot() { - if e.event.Kind == EphemeralEventClear && e.event.Reason == "expired" { + if e.event.Type == EphemeralEventClear && e.event.Reason == "expired" { foundExpired = true } } @@ -293,3 +293,130 @@ func TestEphemeralStore_OwnershipTransfer(t *testing.T) { t.Errorf("new owner's disconnect did not clear, count=%d", s.keyCountForTest()) } } + +// TestEphemeralStore_wireFormatContract pins the EXACT JSON wire shape of the +// synthetic events — the `_orama` control-frame contract agreed with app teams +// on bugboard #710 (#458/#505/#849/#901). Client sub-routers dispatch on the +// `_orama` discriminator; renaming any of these fields is a breaking protocol +// change and must fail this test. +func TestEphemeralStore_wireFormatContract(t *testing.T) { + type raw struct { + Orama string `json:"_orama"` + Topic string `json:"topic"` + Key string `json:"key"` + ClientID string `json:"client_id"` + Payload []byte `json:"payload"` + Reason string `json:"reason"` + } + var got []raw + pub := func(_ context.Context, _, _ string, data []byte) error { + var r raw + if err := json.Unmarshal(data, &r); err != nil { + return err + } + got = append(got, r) + return nil + } + s := newTestStore(pub) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "client-A", "typing:room1", "user-7", []byte("blob"), 0); err != nil { + t.Fatalf("Set: %v", err) + } + s.ClearClient(ctx, "client-A") + + if len(got) != 2 { + t.Fatalf("expected 2 events (set + disconnect clear), got %d", len(got)) + } + set, clear := got[0], got[1] + if set.Orama != "ephemeral.set" { + t.Errorf(`set _orama = %q, want "ephemeral.set"`, set.Orama) + } + if set.Topic != "typing:room1" || set.Key != "user-7" || set.ClientID != "client-A" { + t.Errorf("set event 
fields wrong: %+v", set) + } + if string(set.Payload) != "blob" { + t.Errorf("set payload = %q, want blob", set.Payload) + } + if clear.Orama != "ephemeral.clear" { + t.Errorf(`clear _orama = %q, want "ephemeral.clear"`, clear.Orama) + } + if clear.Topic != "typing:room1" || clear.Key != "user-7" || clear.Reason != "disconnect" { + t.Errorf("clear event fields wrong: %+v", clear) + } +} + +func TestEphemeralStoreList_returnsLiveEntriesSorted(t *testing.T) { + s := newTestStore(nil) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "client-B", "presence:room1", "zeta", []byte("z"), 0); err != nil { + t.Fatalf("Set zeta: %v", err) + } + if err := s.Set(ctx, "ns1", "client-A", "presence:room1", "alpha", []byte("a"), 0); err != nil { + t.Fatalf("Set alpha: %v", err) + } + + entries := s.List("ns1", "presence:room1") + if len(entries) != 2 { + t.Fatalf("List returned %d entries, want 2", len(entries)) + } + if entries[0].Key != "alpha" || entries[1].Key != "zeta" { + t.Errorf("entries not sorted by key: %q, %q", entries[0].Key, entries[1].Key) + } + if entries[0].ClientID != "client-A" || string(entries[0].Payload) != "a" { + t.Errorf("entry fields wrong: %+v", entries[0]) + } + if entries[0].ExpiresInMs <= 0 { + t.Errorf("ExpiresInMs must be positive for a live entry, got %d", entries[0].ExpiresInMs) + } +} + +func TestEphemeralStoreList_excludesExpiredAndOtherScopes(t *testing.T) { + s := newTestStore(nil) + ctx := context.Background() + base := time.Now() + s.now = func() time.Time { return base } + + if err := s.Set(ctx, "ns1", "c", "t", "live", []byte("p"), 60_000); err != nil { + t.Fatalf("Set live: %v", err) + } + if err := s.Set(ctx, "ns1", "c", "t", "dying", []byte("p"), 1000); err != nil { + t.Fatalf("Set dying: %v", err) + } + if err := s.Set(ctx, "ns2", "c", "t", "other-ns", []byte("p"), 60_000); err != nil { + t.Fatalf("Set other-ns: %v", err) + } + if err := s.Set(ctx, "ns1", "c", "t2", "other-topic", []byte("p"), 60_000); err != nil { + 
t.Fatalf("Set other-topic: %v", err) + } + + // Advance past "dying"'s TTL but do NOT sweep — List must hide it anyway. + s.now = func() time.Time { return base.Add(2 * time.Second) } + + entries := s.List("ns1", "t") + if len(entries) != 1 || entries[0].Key != "live" { + t.Fatalf("List = %+v, want exactly the single live ns1/t entry", entries) + } +} + +func TestEphemeralStoreList_emptyTopicReturnsEmpty(t *testing.T) { + s := newTestStore(nil) + if entries := s.List("ns1", "nothing-here"); len(entries) != 0 { + t.Errorf("List on empty topic = %+v, want empty", entries) + } +} + +func TestEphemeralStoreList_snapshotIsDefensiveCopy(t *testing.T) { + s := newTestStore(nil) + ctx := context.Background() + if err := s.Set(ctx, "ns1", "c", "t", "k", []byte("orig"), 0); err != nil { + t.Fatalf("Set: %v", err) + } + entries := s.List("ns1", "t") + entries[0].Payload[0] = 'X' + fresh := s.List("ns1", "t") + if string(fresh[0].Payload) != "orig" { + t.Error("List payload is not a defensive copy; store was mutated") + } +} diff --git a/core/pkg/serverless/hostfuncs_test.go b/core/pkg/serverless/hostfuncs_test.go index 9c13fed..bebe2ef 100644 --- a/core/pkg/serverless/hostfuncs_test.go +++ b/core/pkg/serverless/hostfuncs_test.go @@ -146,6 +146,10 @@ func (m *mockHostServices) EphemeralStateClear(ctx context.Context, topic, key s return nil } +func (m *mockHostServices) EphemeralStateList(ctx context.Context, topic string) ([]byte, error) { + return []byte(`{"entries":[]}`), nil +} + func (m *mockHostServices) WSSend(ctx context.Context, clientID string, data []byte) error { return nil } diff --git a/core/pkg/serverless/hostfunctions/pubsub.go b/core/pkg/serverless/hostfunctions/pubsub.go index dd05d6b..a312381 100644 --- a/core/pkg/serverless/hostfunctions/pubsub.go +++ b/core/pkg/serverless/hostfunctions/pubsub.go @@ -220,6 +220,34 @@ func (h *HostFunctions) EphemeralStateClear(ctx context.Context, topic, key stri return nil } +// ephemeralListEnvelope is the JSON shape 
returned by EphemeralStateList — +// an object (not a bare array) so fields can be added without breaking +// existing WASM callers. +type ephemeralListEnvelope struct { + Entries []serverless.EphemeralListEntry `json:"entries"` +} + +// EphemeralStateList returns the live ephemeral entries on a topic in the +// invocation's namespace (bugboard #710 reconnect catch-up). Read-only: no +// WS client required, so HTTP-invoked functions can serve snapshots too. +func (h *HostFunctions) EphemeralStateList(ctx context.Context, topic string) ([]byte, error) { + if h.ephemeralStore == nil { + return nil, &serverless.HostFunctionError{Function: "ephemeral_state_list", Cause: fmt.Errorf("ephemeral state not available on this gateway")} + } + if topic == "" { + return nil, &serverless.HostFunctionError{Function: "ephemeral_state_list", Cause: fmt.Errorf("topic is required")} + } + cur := h.currentInvocationContext(ctx) + if cur == nil { + return nil, &serverless.HostFunctionError{Function: "ephemeral_state_list", Cause: fmt.Errorf("no invocation context")} + } + out, err := json.Marshal(ephemeralListEnvelope{Entries: h.ephemeralStore.List(cur.Namespace, topic)}) + if err != nil { + return nil, &serverless.HostFunctionError{Function: "ephemeral_state_list", Cause: fmt.Errorf("marshal entries: %w", err)} + } + return out, nil +} + // WSSend sends data to a specific WebSocket client. 
func (h *HostFunctions) WSSend(ctx context.Context, clientID string, data []byte) error { if h.wsManager == nil { diff --git a/core/pkg/serverless/mocks_test.go b/core/pkg/serverless/mocks_test.go index 0240dc0..7cb993a 100644 --- a/core/pkg/serverless/mocks_test.go +++ b/core/pkg/serverless/mocks_test.go @@ -259,6 +259,10 @@ func (m *MockHostServices) EphemeralStateClear(ctx context.Context, topic, key s return nil } +func (m *MockHostServices) EphemeralStateList(ctx context.Context, topic string) ([]byte, error) { + return []byte(`{"entries":[]}`), nil +} + func (m *MockHostServices) WSSend(ctx context.Context, clientID string, data []byte) error { return nil } diff --git a/core/pkg/serverless/types.go b/core/pkg/serverless/types.go index 19c9455..f9d99a8 100644 --- a/core/pkg/serverless/types.go +++ b/core/pkg/serverless/types.go @@ -595,6 +595,14 @@ type HostServices interface { // non-owned key is a no-op. Errors only on no-WS-client / empty topic-key. EphemeralStateClear(ctx context.Context, topic, key string) error + // EphemeralStateList returns the live entries on a topic in the current + // invocation's namespace as a JSON envelope: + // {"entries":[{"key":..,"client_id":..,"payload":"base64..","expires_in_ms":..}, …]} + // The reconnect catch-up read (bugboard #710 acceptance): unlike + // Set/Clear it does NOT require a WS client in context — any function + // invocation may read. Errors on empty topic or no invocation context. 
+ EphemeralStateList(ctx context.Context, topic string) ([]byte, error) + // WebSocket operations (only valid in WS context) WSSend(ctx context.Context, clientID string, data []byte) error WSBroadcast(ctx context.Context, topic string, data []byte) error diff --git a/core/pkg/sniproxy/discoverer.go b/core/pkg/sniproxy/discoverer.go new file mode 100644 index 0000000..0f8d3eb --- /dev/null +++ b/core/pkg/sniproxy/discoverer.go @@ -0,0 +1,129 @@ +package sniproxy + +import ( + "strings" + "time" + + "go.uber.org/zap" +) + +// discoveryWarnInterval rate-limits the "discovery scan failed" warning so a +// persistently-unreadable namespaces directory cannot flood the journal. +const discoveryWarnInterval = 5 * time.Minute + +// StaticRoutes returns the operator-set routes parsed from the SNI router's own +// config file plus the fallback backend. The discoverer merges these with the +// auto-discovered TURN routes; static routes win on an SNI conflict. +type StaticRoutes func() (routes []Route, fallback Backend, err error) + +// TURNRouteDiscoverer periodically rescans the namespaces directory for +// per-namespace TURNS listeners, merges the discovered routes with the static +// routes from the config file (static wins on conflict), and atomically +// installs the result on the Router. +// +// A transient failure (unreadable namespaces dir, or a bad static-config read) +// logs a rate-limited warning and KEEPS the previously-installed routes — a +// filesystem hiccup must never blackhole live :443 traffic. +type TURNRouteDiscoverer struct { + cfg TURNDiscoveryConfig + static StaticRoutes + router *Router + logger *zap.Logger + + // lastWarn is only touched by the Run goroutine after the synchronous + // startup Apply, so it needs no lock. + lastWarn time.Time +} + +// NewTURNRouteDiscoverer constructs a discoverer. static reads the operator's +// config-file routes + fallback; router receives the merged Replace calls. 
+func NewTURNRouteDiscoverer(cfg TURNDiscoveryConfig, static StaticRoutes, router *Router, logger *zap.Logger) *TURNRouteDiscoverer { + if logger == nil { + logger = zap.NewNop() + } + return &TURNRouteDiscoverer{cfg: cfg, static: static, router: router, logger: logger} +} + +// Apply performs one scan+merge and installs the result atomically. On any +// transient error it returns the error and leaves the Router untouched so the +// caller can decide whether to fail startup (Apply) or keep stale routes (Run). +func (d *TURNRouteDiscoverer) Apply() error { + staticRoutes, fallback, err := d.static() + if err != nil { + return err + } + + discovered, err := DiscoverTURNRoutes(d.cfg, d.logger) + if err != nil { + return err + } + + merged := mergeRoutes(staticRoutes, discovered) + d.router.Replace(merged, fallback) + return nil +} + +// Run scans immediately, then every rescan interval until stop is closed. A +// failed scan keeps the current routes and logs a rate-limited warning. +func (d *TURNRouteDiscoverer) Run(stop <-chan struct{}) { + if err := d.Apply(); err != nil { + d.warn("initial TURN route discovery failed; serving config-file routes only", err) + } + + interval := d.cfg.RescanInterval + if interval <= 0 { + interval = DefaultDiscoveryRescanInterval + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-stop: + return + case <-ticker.C: + if err := d.Apply(); err != nil { + d.warn("TURN route discovery failed; keeping current routes", err) + continue + } + } + } +} + +// warn logs at most once per discoveryWarnInterval to avoid journal flooding +// when the namespaces directory is persistently unreadable. 
+func (d *TURNRouteDiscoverer) warn(msg string, err error) { + now := time.Now() + if now.Sub(d.lastWarn) < discoveryWarnInterval { + return + } + d.lastWarn = now + d.logger.Warn(msg, + zap.String("namespaces_dir", d.cfg.NamespacesDir), + zap.Error(err)) +} + +// mergeRoutes combines static and discovered routes with static taking +// precedence on an SNI-match conflict. Static routes keep their original order +// and precede discovered ones, matching Router.Pick's first-match semantics. +func mergeRoutes(static, discovered []Route) []Route { + seen := make(map[string]struct{}, len(static)) + merged := make([]Route, 0, len(static)+len(discovered)) + for _, r := range static { + seen[matchKey(r.Match)] = struct{}{} + merged = append(merged, r) + } + for _, r := range discovered { + if _, conflict := seen[matchKey(r.Match)]; conflict { + continue // static wins + } + merged = append(merged, r) + } + return merged +} + +// matchKey normalizes an SNI match for conflict comparison (matching is +// case-insensitive, mirroring Router.Pick / matchSNI). +func matchKey(match string) string { + return strings.ToLower(match) +} diff --git a/core/pkg/sniproxy/discoverer_test.go b/core/pkg/sniproxy/discoverer_test.go new file mode 100644 index 0000000..277b33b --- /dev/null +++ b/core/pkg/sniproxy/discoverer_test.go @@ -0,0 +1,143 @@ +package sniproxy + +import ( + "errors" + "path/filepath" + "testing" + + "github.com/DeBrosOfficial/network/pkg/turn" +) + +// TestTURNRouteDiscoverer_staticRouteWinsMerge verifies that when a discovered +// stealth route collides with a static config route on the same SNI, the static +// route's backend is the one that ends up in the router (static wins). 
+func TestTURNRouteDiscoverer_staticRouteWinsMerge(t *testing.T) { + dir := t.TempDir() + const base = "example.com" + writeTURNConfig(t, dir, "anchat", "node-1", "0.0.0.0:5349") + + stealthHost := turn.StealthHostForNamespace("anchat", base) + fallback := Backend{Name: "caddy", Network: "tcp", Addr: "127.0.0.1:8443"} + + // Static config pins the very same stealth host to a DIFFERENT backend. + static := func() ([]Route, Backend, error) { + return []Route{ + {Match: stealthHost, Backend: Backend{Name: "static-override", Network: "tcp", Addr: "127.0.0.1:9999"}}, + }, fallback, nil + } + + router := NewRouter(Backend{}) + d := NewTURNRouteDiscoverer(TURNDiscoveryConfig{NamespacesDir: dir, BaseDomain: base}, static, router, nil) + if err := d.Apply(); err != nil { + t.Fatalf("Apply failed: %v", err) + } + + // Pick must return the static backend, not the discovered one. + got := router.Pick(stealthHost) + if got.Addr != "127.0.0.1:9999" { + t.Errorf("static route should win: got backend %q, want 127.0.0.1:9999", got.Addr) + } + + // The non-conflicting discovered alias must still be present. + alias := router.Pick("turn.ns-anchat." + base) + if alias.Addr != "127.0.0.1:5349" { + t.Errorf("discovered alias route missing/wrong: got %q", alias.Addr) + } + + // Fallback preserved from static source. + if router.Fallback().Addr != "127.0.0.1:8443" { + t.Errorf("fallback not preserved: got %q", router.Fallback().Addr) + } +} + +// TestTURNRouteDiscoverer_transientErrorKeepsPreviousRoutes verifies that once +// routes are installed, a subsequent Apply whose scan fails (namespaces dir +// removed) returns an error and leaves the previously-installed routes intact — +// a transient filesystem error must never blackhole :443. 
+func TestTURNRouteDiscoverer_transientErrorKeepsPreviousRoutes(t *testing.T) { + parent := t.TempDir() + nsDir := filepath.Join(parent, "namespaces") + const base = "example.com" + writeTURNConfig(t, nsDir, "anchat", "node-1", "0.0.0.0:5349") + + fallback := Backend{Name: "caddy", Network: "tcp", Addr: "127.0.0.1:8443"} + static := func() ([]Route, Backend, error) { return nil, fallback, nil } + + router := NewRouter(Backend{}) + d := NewTURNRouteDiscoverer(TURNDiscoveryConfig{NamespacesDir: nsDir, BaseDomain: base}, static, router, nil) + + // First Apply succeeds and installs the anchat routes. + if err := d.Apply(); err != nil { + t.Fatalf("first Apply failed: %v", err) + } + before := len(router.Routes()) + if before != 2 { + t.Fatalf("expected 2 routes after first apply, got %d", before) + } + + // Make the namespaces dir unreadable by pointing the discoverer at a now- + // removed path (simulate transient read failure). + d.cfg.NamespacesDir = filepath.Join(parent, "gone") + + err := d.Apply() + if err == nil { + t.Fatalf("expected Apply to error on missing namespaces dir") + } + + // Routes must be unchanged — the failed scan kept the previous table. + after := router.Routes() + if len(after) != before { + t.Errorf("routes changed on transient error: had %d, now %d", before, len(after)) + } + stealthHost := turn.StealthHostForNamespace("anchat", base) + if router.Pick(stealthHost).Addr != "127.0.0.1:5349" { + t.Errorf("previously-installed stealth route lost after transient error") + } +} + +// TestTURNRouteDiscoverer_staticSourceErrorKeepsRoutes verifies a failing static +// source (e.g. a bad config-file edit) also leaves the router untouched. 
+func TestTURNRouteDiscoverer_staticSourceErrorKeepsRoutes(t *testing.T) { + dir := t.TempDir() + const base = "example.com" + writeTURNConfig(t, dir, "anchat", "node-1", "0.0.0.0:5349") + + fallback := Backend{Name: "caddy", Network: "tcp", Addr: "127.0.0.1:8443"} + good := func() ([]Route, Backend, error) { return nil, fallback, nil } + + router := NewRouter(Backend{}) + d := NewTURNRouteDiscoverer(TURNDiscoveryConfig{NamespacesDir: dir, BaseDomain: base}, good, router, nil) + if err := d.Apply(); err != nil { + t.Fatalf("first Apply failed: %v", err) + } + before := len(router.Routes()) + + // Swap in a static source that errors (simulates a malformed config file). + d.static = func() ([]Route, Backend, error) { return nil, Backend{}, errors.New("bad config") } + if err := d.Apply(); err == nil { + t.Fatalf("expected Apply to error on static source failure") + } + if len(router.Routes()) != before { + t.Errorf("routes changed on static-source error: had %d, now %d", before, len(router.Routes())) + } +} + +// TestMergeRoutes_staticPrecedesDiscovered checks first-match ordering: static +// routes precede discovered ones in the merged slice. 
+func TestMergeRoutes_staticPrecedesDiscovered(t *testing.T) { + static := []Route{{Match: "a.example.com", Backend: Backend{Addr: "127.0.0.1:1"}}} + discovered := []Route{ + {Match: "a.example.com", Backend: Backend{Addr: "127.0.0.1:2"}}, // conflict, dropped + {Match: "b.example.com", Backend: Backend{Addr: "127.0.0.1:3"}}, + } + merged := mergeRoutes(static, discovered) + if len(merged) != 2 { + t.Fatalf("expected 2 merged routes (1 static + 1 non-conflicting), got %d: %+v", len(merged), merged) + } + if merged[0].Match != "a.example.com" || merged[0].Backend.Addr != "127.0.0.1:1" { + t.Errorf("static route should be first and unchanged: %+v", merged[0]) + } + if merged[1].Match != "b.example.com" { + t.Errorf("non-conflicting discovered route missing: %+v", merged) + } +} diff --git a/core/pkg/sniproxy/discovery.go b/core/pkg/sniproxy/discovery.go new file mode 100644 index 0000000..a3505ed --- /dev/null +++ b/core/pkg/sniproxy/discovery.go @@ -0,0 +1,185 @@ +package sniproxy + +import ( + "fmt" + "net" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/turn" + "go.uber.org/zap" + "gopkg.in/yaml.v3" +) + +// DefaultDiscoveryRescanInterval is the default cadence at which the TURN route +// discoverer rescans the namespaces directory. SNI route changes (a namespace +// gaining or losing its TURNS listener) are infrequent, so 30s of detection +// latency is acceptable and keeps load on the filesystem negligible. +const DefaultDiscoveryRescanInterval = 30 * time.Second + +// turnConfigGlob matches the per-node TURN config files the namespace spawner +// writes under "{namespaces_dir}/{namespace}/configs/turn-{node_id}.yaml". +const turnConfigGlob = "configs/turn-*.yaml" + +// stealthBackendNamePrefix labels discovered TURN backends in logs/metrics. +const stealthBackendNamePrefix = "turn-stealth-" + +// The router forwards two SNI hostname shapes to a namespace's TURNS listener +// (both are built inline in discoverNamespaceRoutes, not held in constants): 
+// - the bland hashed host from turn.StealthHostForNamespace (DPI-resistant) +// - a human-readable "turn.ns-{namespace}.{base_domain}" alias (operator UX) + +// TURNDiscoveryConfig configures the namespaces scan that derives per-namespace +// stealth-TURN routes. NamespacesDir and BaseDomain are required; a zero RescanInterval selects +// DefaultDiscoveryRescanInterval. +type TURNDiscoveryConfig struct { + // NamespacesDir is the directory holding one subdirectory per namespace, + // each containing a "configs/turn-*.yaml" written by the namespace spawner + // (e.g. "/opt/orama/.orama/data/namespaces"). + NamespacesDir string `yaml:"namespaces_dir"` + + // BaseDomain is the cluster's base domain (e.g. "orama-devnet.network"), + // used to derive the stealth and "turn.ns-*" SNI hostnames. + BaseDomain string `yaml:"base_domain"` + + // RescanInterval is how often the namespaces directory is rescanned. Zero + // selects DefaultDiscoveryRescanInterval. + RescanInterval time.Duration `yaml:"rescan_interval"` +} + +// Validate reports configuration errors. It does not touch the filesystem; a +// missing NamespacesDir at scan time is a transient error handled by the +// discoverer (previous routes are kept), not a config error. +func (c *TURNDiscoveryConfig) Validate() []string { + var errs []string + if c.NamespacesDir == "" { + errs = append(errs, "turn_discovery.namespaces_dir: required") + } + if c.BaseDomain == "" { + errs = append(errs, "turn_discovery.base_domain: required") + } + return errs +} + +// DiscoverTURNRoutes scans cfg.NamespacesDir for per-namespace TURN configs and +// returns two routes per namespace that exposes a TURNS listener: +// +// - turn.StealthHostForNamespace(namespace, baseDomain) -> 127.0.0.1:{turns_port} +// - "turn.ns-{namespace}.{base_domain}" -> 127.0.0.1:{turns_port} +// +// Namespaces whose TURN config has an empty turns_listen_addr (TURNS disabled) +// are skipped. 
A turn-*.yaml that cannot be read or parsed is skipped with a +// per-file warning, but the scan continues for the rest — one bad file must not +// hide every other namespace's routes. +// +// A failure to read the namespaces directory itself returns an error so callers +// can keep the previously-installed routes rather than wiping them on a +// transient filesystem error. +func DiscoverTURNRoutes(cfg TURNDiscoveryConfig, logger *zap.Logger) ([]Route, error) { + if logger == nil { + logger = zap.NewNop() + } + + entries, err := os.ReadDir(cfg.NamespacesDir) + if err != nil { + return nil, fmt.Errorf("read namespaces dir %s: %w", cfg.NamespacesDir, err) + } + + var routes []Route + for _, entry := range entries { + if !entry.IsDir() { + continue + } + nsRoutes := discoverNamespaceRoutes(cfg, entry.Name(), logger) + routes = append(routes, nsRoutes...) + } + + // Deterministic order keeps Router.Replace idempotent and tests stable. + sort.Slice(routes, func(i, j int) bool { return routes[i].Match < routes[j].Match }) + return routes, nil +} + +// discoverNamespaceRoutes resolves the stealth + alias routes for a single +// namespace directory. Returns nil when the namespace has no TURNS listener or +// its config is unreadable/unparseable (logged, not fatal). +func discoverNamespaceRoutes(cfg TURNDiscoveryConfig, nsDir string, logger *zap.Logger) []Route { + glob := filepath.Join(cfg.NamespacesDir, nsDir, turnConfigGlob) + matches, err := filepath.Glob(glob) + if err != nil { + // Glob only errors on a malformed pattern, which turnConfigGlob is not; + // guard anyway so a future edit can't silently swallow it. 
+ logger.Warn("turn-config glob failed", + zap.String("namespace_dir", nsDir), zap.Error(err)) + return nil + } + + for _, configPath := range matches { + namespace, tlsPort, ok := parseTURNConfig(configPath, logger) + if !ok { + continue + } + backend := Backend{ + Name: stealthBackendNamePrefix + namespace, + Network: "tcp", + Addr: net.JoinHostPort("127.0.0.1", tlsPort), + } + return []Route{ + {Match: turn.StealthHostForNamespace(namespace, cfg.BaseDomain), Backend: backend}, + {Match: fmt.Sprintf("turn.ns-%s.%s", namespace, cfg.BaseDomain), Backend: backend}, + } + } + return nil +} + +// parseTURNConfig reads a turn-*.yaml and returns its namespace and TURNS port. +// ok is false (with a warning) when the file is unreadable/unparseable, when it +// names no namespace, or when TURNS is disabled (empty turns_listen_addr). +func parseTURNConfig(path string, logger *zap.Logger) (namespace, tlsPort string, ok bool) { + data, err := os.ReadFile(path) + if err != nil { + logger.Warn("read turn config failed", zap.String("path", path), zap.Error(err)) + return "", "", false + } + + var c turn.Config + if err := yaml.Unmarshal(data, &c); err != nil { + logger.Warn("parse turn config failed", zap.String("path", path), zap.Error(err)) + return "", "", false + } + + if c.Namespace == "" { + logger.Warn("turn config has empty namespace", zap.String("path", path)) + return "", "", false + } + if strings.TrimSpace(c.TURNSListenAddr) == "" { + // TURNS disabled for this namespace — no stealth route, not an error. + return "", "", false + } + + port, err := portFromListenAddr(c.TURNSListenAddr) + if err != nil { + logger.Warn("turn config has invalid turns_listen_addr", + zap.String("path", path), + zap.String("turns_listen_addr", c.TURNSListenAddr), + zap.Error(err)) + return "", "", false + } + return c.Namespace, port, true +} + +// portFromListenAddr extracts the port from a "host:port" TURNS listen address +// (e.g. "0.0.0.0:5349" -> "5349"). 
The router always dials 127.0.0.1, so only +// the port is needed. +func portFromListenAddr(addr string) (string, error) { + _, port, err := net.SplitHostPort(addr) + if err != nil { + return "", fmt.Errorf("split host:port: %w", err) + } + if port == "" { + return "", fmt.Errorf("empty port in %q", addr) + } + return port, nil +} diff --git a/core/pkg/sniproxy/discovery_test.go b/core/pkg/sniproxy/discovery_test.go new file mode 100644 index 0000000..f7819d4 --- /dev/null +++ b/core/pkg/sniproxy/discovery_test.go @@ -0,0 +1,167 @@ +package sniproxy + +import ( + "os" + "path/filepath" + "testing" + + "github.com/DeBrosOfficial/network/pkg/turn" +) + +// writeTURNConfig is a test helper that lays out the on-disk shape the namespace +// spawner produces: {namespacesDir}/{namespace}/configs/turn-{nodeID}.yaml. +func writeTURNConfig(t *testing.T, namespacesDir, namespace, nodeID, turnsAddr string) { + t.Helper() + configDir := filepath.Join(namespacesDir, namespace, "configs") + if err := os.MkdirAll(configDir, 0755); err != nil { + t.Fatalf("mkdir configs failed: %v", err) + } + content := "namespace: \"" + namespace + "\"\n" + content += "turns_listen_addr: \"" + turnsAddr + "\"\n" + path := filepath.Join(configDir, "turn-"+nodeID+".yaml") + if err := os.WriteFile(path, []byte(content), 0644); err != nil { + t.Fatalf("write turn config failed: %v", err) + } +} + +// TestDiscoverTURNRoutes_scansFixtureDir verifies that two namespaces each with +// a TURNS listener yield two routes apiece (stealth host + turn.ns-* alias), +// while a namespace with an empty turns_listen_addr is skipped entirely. +func TestDiscoverTURNRoutes_scansFixtureDir(t *testing.T) { + dir := t.TempDir() + const base = "orama-devnet.network" + + writeTURNConfig(t, dir, "anchat", "node-1", "0.0.0.0:5349") + writeTURNConfig(t, dir, "video", "node-1", "0.0.0.0:5350") + // TURNS disabled — must produce no routes. 
+ writeTURNConfig(t, dir, "noturns", "node-1", "") + + routes, err := DiscoverTURNRoutes(TURNDiscoveryConfig{ + NamespacesDir: dir, + BaseDomain: base, + }, nil) + if err != nil { + t.Fatalf("DiscoverTURNRoutes failed: %v", err) + } + + // 2 namespaces with TURNS × 2 routes each = 4. + if len(routes) != 4 { + t.Fatalf("expected 4 routes, got %d: %+v", len(routes), routes) + } + + got := map[string]string{} + for _, r := range routes { + got[r.Match] = r.Backend.Addr + } + + // anchat: backend port 5349, stealth host + alias. + anchatStealth := turn.StealthHostForNamespace("anchat", base) + if got[anchatStealth] != "127.0.0.1:5349" { + t.Errorf("anchat stealth route missing/wrong: %q -> %q", anchatStealth, got[anchatStealth]) + } + if got["turn.ns-anchat."+base] != "127.0.0.1:5349" { + t.Errorf("anchat alias route missing/wrong: got %q", got["turn.ns-anchat."+base]) + } + + // video: backend port 5350. + videoStealth := turn.StealthHostForNamespace("video", base) + if got[videoStealth] != "127.0.0.1:5350" { + t.Errorf("video stealth route missing/wrong: %q -> %q", videoStealth, got[videoStealth]) + } + if got["turn.ns-video."+base] != "127.0.0.1:5350" { + t.Errorf("video alias route missing/wrong: got %q", got["turn.ns-video."+base]) + } + + // The disabled namespace must not appear under any of its hostnames. + if _, ok := got["turn.ns-noturns."+base]; ok { + t.Errorf("noturns namespace should be skipped (empty turns_listen_addr)") + } +} + +// TestDiscoverTURNRoutes_emptyTURNSAddrSkipped is a focused check that a single +// namespace with an empty turns_listen_addr produces zero routes (no error). 
+func TestDiscoverTURNRoutes_emptyTURNSAddrSkipped(t *testing.T) { + dir := t.TempDir() + writeTURNConfig(t, dir, "noturns", "node-1", "") + + routes, err := DiscoverTURNRoutes(TURNDiscoveryConfig{ + NamespacesDir: dir, + BaseDomain: "example.com", + }, nil) + if err != nil { + t.Fatalf("DiscoverTURNRoutes failed: %v", err) + } + if len(routes) != 0 { + t.Errorf("expected 0 routes for TURNS-disabled namespace, got %d: %+v", len(routes), routes) + } +} + +// TestDiscoverTURNRoutes_unreadableDirReturnsError verifies a missing namespaces +// directory is a transient error (so callers keep previous routes), not a silent +// empty result. +func TestDiscoverTURNRoutes_unreadableDirReturnsError(t *testing.T) { + missing := filepath.Join(t.TempDir(), "does-not-exist") + + routes, err := DiscoverTURNRoutes(TURNDiscoveryConfig{ + NamespacesDir: missing, + BaseDomain: "example.com", + }, nil) + if err == nil { + t.Fatalf("expected an error for unreadable namespaces dir, got nil (routes=%+v)", routes) + } + if routes != nil { + t.Errorf("expected nil routes on error, got %+v", routes) + } +} + +// TestDiscoverTURNRoutes_malformedFileSkipped verifies one unparseable +// turn-*.yaml is skipped while a sibling valid namespace still yields routes +// (one bad file must not hide the rest). 
+func TestDiscoverTURNRoutes_malformedFileSkipped(t *testing.T) { + dir := t.TempDir() + const base = "example.com" + + writeTURNConfig(t, dir, "good", "node-1", "0.0.0.0:5349") + + badDir := filepath.Join(dir, "bad", "configs") + if err := os.MkdirAll(badDir, 0755); err != nil { + t.Fatalf("mkdir bad configs failed: %v", err) + } + if err := os.WriteFile(filepath.Join(badDir, "turn-node-1.yaml"), []byte(":\n not: [valid"), 0644); err != nil { + t.Fatalf("write malformed config failed: %v", err) + } + + routes, err := DiscoverTURNRoutes(TURNDiscoveryConfig{ + NamespacesDir: dir, + BaseDomain: base, + }, nil) + if err != nil { + t.Fatalf("DiscoverTURNRoutes failed: %v", err) + } + if len(routes) != 2 { + t.Fatalf("expected 2 routes from the good namespace, got %d: %+v", len(routes), routes) + } + goodStealth := turn.StealthHostForNamespace("good", base) + found := false + for _, r := range routes { + if r.Match == goodStealth { + found = true + } + } + if !found { + t.Errorf("good namespace stealth route missing despite malformed sibling") + } +} + +// TestTURNDiscoveryConfig_Validate covers the required-field validation. 
+func TestTURNDiscoveryConfig_Validate(t *testing.T) { + if errs := (&TURNDiscoveryConfig{NamespacesDir: "/x", BaseDomain: "example.com"}).Validate(); len(errs) != 0 { + t.Errorf("valid config reported errors: %v", errs) + } + if errs := (&TURNDiscoveryConfig{BaseDomain: "example.com"}).Validate(); len(errs) == 0 { + t.Errorf("missing namespaces_dir should be invalid") + } + if errs := (&TURNDiscoveryConfig{NamespacesDir: "/x"}).Validate(); len(errs) == 0 { + t.Errorf("missing base_domain should be invalid") + } +} diff --git a/core/pkg/turn/config.go b/core/pkg/turn/config.go index 0b9bb49..d54bf0f 100644 --- a/core/pkg/turn/config.go +++ b/core/pkg/turn/config.go @@ -36,6 +36,27 @@ type Config struct { // Namespace this TURN instance belongs to Namespace string `yaml:"namespace"` + + // StealthDomain is the neutral, CDN-bland SNI hostname this server also + // answers TURNS for (e.g. "cdn-a1b2c3d4e5f6.orama-devnet.network"). + // + // The stealth endpoint is an SNI-router passthrough, NOT a separate TURN + // server: a router on :443 reads only the TLS ClientHello SNI and forwards + // the raw bytes for this hostname to this same TURNS listener. TLS is still + // terminated here, by this TURN server, which therefore presents two certs + // (the primary TURN domain and StealthDomain) selected by ClientHello SNI. + // When empty, the stealth endpoint is disabled and behavior is unchanged. + StealthDomain string `yaml:"stealth_domain,omitempty"` + + // TLSStealthCertPath is the path to the TLS certificate PEM file presented + // for StealthDomain. The SNI router only forwards bytes; this TURN server + // terminates the TLS handshake, so it needs the stealth domain's cert here. + TLSStealthCertPath string `yaml:"tls_stealth_cert_path,omitempty"` + + // TLSStealthKeyPath is the path to the TLS private key PEM file for the + // StealthDomain certificate (TURN terminates TLS for the router-forwarded + // stealth connections). 
+ TLSStealthKeyPath string `yaml:"tls_stealth_key_path,omitempty"` } // Validate checks the TURN configuration for errors diff --git a/core/pkg/turn/server.go b/core/pkg/turn/server.go index d80f361..f6e10c7 100644 --- a/core/pkg/turn/server.go +++ b/core/pkg/turn/server.go @@ -15,6 +15,11 @@ import ( "go.uber.org/zap" ) +// stealthConfigFieldCount is the number of stealth TLS config fields that must +// be set together (StealthDomain, TLSStealthCertPath, TLSStealthKeyPath). Any +// other count is a partial config and fails server startup. +const stealthConfigFieldCount = 3 + // Server wraps a Pion TURN server with namespace-scoped HMAC-SHA1 authentication. type Server struct { config *Config @@ -24,8 +29,9 @@ type Server struct { tcpListener net.Listener // Plain TCP listener on primary port (3478) tlsListener net.Listener // TLS TCP listener for TURNS (port 5349) - certReloader *certReloader // hot-reloads the TURNS cert; nil when TURNS disabled - certStop chan struct{} // closed to stop the cert-reload watcher goroutine + certReloader *certReloader // hot-reloads the primary TURNS cert; nil when TURNS disabled + stealthCertReloader *certReloader // hot-reloads the stealth-SNI cert; nil when stealth disabled + certStop chan struct{} // closed to stop the cert-reload watcher goroutine(s) } // NewServer creates and starts a TURN server. @@ -94,8 +100,18 @@ func NewServer(cfg *Config, logger *zap.Logger) (*Server, error) { s.closeListeners() return nil, fmt.Errorf("failed to load TLS cert/key: %w", err) } + s.certReloader = reloader + + // Stealth SNI: when configured, terminate TLS for a second (neutral) + // hostname using its own hot-reloading cert. The SNI router forwards the + // raw stealth-domain bytes to this listener; selection is by ServerName. 
+ if err := s.loadStealthCertReloader(cfg); err != nil { + s.closeListeners() + return nil, err + } + tlsConfig := &tls.Config{ - GetCertificate: reloader.GetCertificate, + GetCertificate: newGetCertificate(cfg.StealthDomain, reloader, s.stealthCertReloader), MinVersion: tls.VersionTLS12, } tlsListener, err := tls.Listen("tcp", cfg.TURNSListenAddr, tlsConfig) @@ -104,9 +120,11 @@ func NewServer(cfg *Config, logger *zap.Logger) (*Server, error) { return nil, fmt.Errorf("failed to listen on %s: %w", cfg.TURNSListenAddr, err) } s.tlsListener = tlsListener - s.certReloader = reloader s.certStop = make(chan struct{}) go reloader.watch(turnCertReloadInterval, s.certStop) + if s.stealthCertReloader != nil { + go s.stealthCertReloader.watch(turnCertReloadInterval, s.certStop) + } listenerConfigs = append(listenerConfigs, pionTurn.ListenerConfig{ Listener: tlsListener, @@ -150,6 +168,62 @@ func NewServer(cfg *Config, logger *zap.Logger) (*Server, error) { return s, nil } +// loadStealthCertReloader sets up the second cert reloader used for the stealth +// SNI hostname, storing it on s.stealthCertReloader. The three stealth fields +// (StealthDomain, TLSStealthCertPath, TLSStealthKeyPath) are all-or-nothing: a +// partial config is an operator mistake and fails startup rather than silently +// running without the stealth endpoint. When none are set, stealth is disabled +// and the primary TLS path is byte-for-byte unchanged. 
+func (s *Server) loadStealthCertReloader(cfg *Config) error { + set := 0 + if cfg.StealthDomain != "" { + set++ + } + if cfg.TLSStealthCertPath != "" { + set++ + } + if cfg.TLSStealthKeyPath != "" { + set++ + } + if set == 0 { + return nil // stealth disabled + } + if set != stealthConfigFieldCount { + var missing []string + if cfg.StealthDomain == "" { + missing = append(missing, "stealth_domain") + } + if cfg.TLSStealthCertPath == "" { + missing = append(missing, "tls_stealth_cert_path") + } + if cfg.TLSStealthKeyPath == "" { + missing = append(missing, "tls_stealth_key_path") + } + return fmt.Errorf("turn: partial stealth config — set all of [stealth_domain, tls_stealth_cert_path, tls_stealth_key_path] or none; missing: %s", strings.Join(missing, ", ")) + } + + reloader, err := newCertReloader(cfg.TLSStealthCertPath, cfg.TLSStealthKeyPath, s.logger) + if err != nil { + return fmt.Errorf("failed to load stealth TLS cert/key (cert=%s, key=%s): %w", cfg.TLSStealthCertPath, cfg.TLSStealthKeyPath, err) + } + s.stealthCertReloader = reloader + return nil +} + +// newGetCertificate builds the tls.Config.GetCertificate callback. When the +// ClientHello ServerName equals stealthDomain (case-insensitively), it serves +// the stealth cert; every other case — including empty SNI and the primary TURN +// domain — serves the primary cert, preserving the pre-stealth behavior. When +// stealth is disabled (stealthReloader nil) it is exactly primary.GetCertificate. +func newGetCertificate(stealthDomain string, primary, stealth *certReloader) func(*tls.ClientHelloInfo) (*tls.Certificate, error) { + return func(hello *tls.ClientHelloInfo) (*tls.Certificate, error) { + if stealth != nil && hello != nil && strings.EqualFold(hello.ServerName, stealthDomain) { + return stealth.GetCertificate(hello) + } + return primary.GetCertificate(hello) + } +} + // authHandler validates HMAC-SHA1 credentials. 
// Username format: {expiry_unix}:{namespace} // Password: base64(HMAC-SHA1(shared_secret, username)) @@ -239,6 +313,8 @@ func (s *Server) closeListeners() { s.tlsListener.Close() s.tlsListener = nil } + s.certReloader = nil + s.stealthCertReloader = nil } // GenerateCredentials creates time-limited HMAC-SHA1 TURN credentials. diff --git a/core/pkg/turn/stealth.go b/core/pkg/turn/stealth.go new file mode 100644 index 0000000..20d7c26 --- /dev/null +++ b/core/pkg/turn/stealth.go @@ -0,0 +1,26 @@ +package turn + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" +) + +// stealthHostHashBytes is how many bytes of the namespace digest appear in the +// stealth hostname label. 6 bytes (12 hex chars) keeps the label CDN-bland +// while making cross-namespace collisions negligible at platform scale. +const stealthHostHashBytes = 6 + +// StealthHostForNamespace derives the censorship-resistant TURNS hostname for +// a namespace: "cdn-<12-hex-of-sha256(namespace)>.<baseDomain>". +// +// Design (feat-124): the label must NOT contain the namespace (an SNI string +// like "cdn.ns-anchat-test.…" hands DPI the exact app to block), must be +// deterministic so every component (cluster manager, namespace gateway, SNI +// router, DNS) derives the same value with no extra coordination, and must be +// unique per namespace because the SNI router maps it to that namespace's +// TURN-TLS backend. 
+func StealthHostForNamespace(namespace, baseDomain string) string { + sum := sha256.Sum256([]byte(namespace)) + return fmt.Sprintf("cdn-%s.%s", hex.EncodeToString(sum[:stealthHostHashBytes]), baseDomain) +} diff --git a/core/pkg/turn/stealth_server_test.go b/core/pkg/turn/stealth_server_test.go new file mode 100644 index 0000000..35866d6 --- /dev/null +++ b/core/pkg/turn/stealth_server_test.go @@ -0,0 +1,201 @@ +package turn + +import ( + "bytes" + "crypto/tls" + "path/filepath" + "strings" + "testing" + + "go.uber.org/zap" +) + +// feat-124: the stealth TURNS endpoint is an SNI-router passthrough — the TURN +// server terminates TLS for both the primary TURN domain and a neutral stealth +// domain, selecting the cert by ClientHello SNI. These pin: per-SNI selection +// (incl. empty SNI, case-insensitivity), partial-config startup failure, and +// the missing stealth-cert startup failure (no silent fallback). + +const ( + stealthTestDomain = "cdn-a1b2c3d4e5f6.orama-devnet.network" + turnTestDomain = "turn.orama-devnet.network" +) + +func writeNamedCert(t *testing.T, dir, name string) (certPath, keyPath string) { + t.Helper() + certPath = filepath.Join(dir, name+".pem") + keyPath = filepath.Join(dir, name+".key.pem") + if err := GenerateSelfSignedCert(certPath, keyPath, "127.0.0.1"); err != nil { + t.Fatalf("GenerateSelfSignedCert(%s): %v", name, err) + } + return certPath, keyPath +} + +func certLeafForSNI(t *testing.T, getCert func(*tls.ClientHelloInfo) (*tls.Certificate, error), serverName string) []byte { + t.Helper() + cert, err := getCert(&tls.ClientHelloInfo{ServerName: serverName}) + if err != nil { + t.Fatalf("GetCertificate(%q): %v", serverName, err) + } + if cert == nil || len(cert.Certificate) == 0 { + t.Fatalf("GetCertificate(%q) returned an empty certificate", serverName) + } + return cert.Certificate[0] +} + +func TestGetCertificate_stealthSNISelectsStealthCert(t *testing.T) { + dir := t.TempDir() + primaryCert, primaryKey := writeNamedCert(t, dir, 
"primary") + stealthCert, stealthKey := writeNamedCert(t, dir, "stealth") + + primary, err := newCertReloader(primaryCert, primaryKey, zap.NewNop()) + if err != nil { + t.Fatalf("newCertReloader(primary): %v", err) + } + stealth, err := newCertReloader(stealthCert, stealthKey, zap.NewNop()) + if err != nil { + t.Fatalf("newCertReloader(stealth): %v", err) + } + + getCert := newGetCertificate(stealthTestDomain, primary, stealth) + + wantPrimary := leafDER(t, primary) + wantStealth := leafDER(t, stealth) + if bytes.Equal(wantPrimary, wantStealth) { + t.Fatal("test setup error: primary and stealth certs must be distinct") + } + + tests := []struct { + name string + serverName string + want []byte + }{ + {"stealth SNI selects stealth cert", stealthTestDomain, wantStealth}, + {"stealth SNI is case-insensitive", strings.ToUpper(stealthTestDomain), wantStealth}, + {"turn domain SNI selects primary cert", turnTestDomain, wantPrimary}, + {"empty SNI selects primary cert", "", wantPrimary}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := certLeafForSNI(t, getCert, tt.serverName) + if !bytes.Equal(got, tt.want) { + t.Errorf("ServerName=%q served the wrong certificate", tt.serverName) + } + }) + } +} + +func TestGetCertificate_stealthDisabledAlwaysPrimary(t *testing.T) { + dir := t.TempDir() + primaryCert, primaryKey := writeNamedCert(t, dir, "primary") + primary, err := newCertReloader(primaryCert, primaryKey, zap.NewNop()) + if err != nil { + t.Fatalf("newCertReloader(primary): %v", err) + } + + // Stealth disabled (nil reloader): every SNI — including a string that looks + // like a stealth host — must serve the primary cert unchanged. 
+ getCert := newGetCertificate("", primary, nil) + want := leafDER(t, primary) + + for _, serverName := range []string{"", turnTestDomain, stealthTestDomain} { + if got := certLeafForSNI(t, getCert, serverName); !bytes.Equal(got, want) { + t.Errorf("ServerName=%q must serve the primary cert when stealth is disabled", serverName) + } + } +} + +func baseStealthConfig(t *testing.T) *Config { + t.Helper() + dir := t.TempDir() + primaryCert, primaryKey := writeNamedCert(t, dir, "primary") + return &Config{ + ListenAddr: "127.0.0.1:0", + TURNSListenAddr: "127.0.0.1:0", + TLSCertPath: primaryCert, + TLSKeyPath: primaryKey, + PublicIP: "127.0.0.1", + Realm: "orama-devnet.network", + AuthSecret: "test-secret-key", + RelayPortStart: 49152, + RelayPortEnd: 50000, + Namespace: "test-ns", + } +} + +func TestServer_partialStealthConfigFails(t *testing.T) { + tests := []struct { + name string + mutate func(c *Config) + wantMissing []string + }{ + { + name: "only stealth_domain set", + mutate: func(c *Config) { c.StealthDomain = stealthTestDomain }, + wantMissing: []string{"tls_stealth_cert_path", "tls_stealth_key_path"}, + }, + { + name: "domain and cert set, key missing", + mutate: func(c *Config) { c.StealthDomain = stealthTestDomain; c.TLSStealthCertPath = "/tmp/x.pem" }, + wantMissing: []string{"tls_stealth_key_path"}, + }, + { + name: "only cert path set", + mutate: func(c *Config) { c.TLSStealthCertPath = "/tmp/x.pem" }, + wantMissing: []string{"stealth_domain", "tls_stealth_key_path"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := baseStealthConfig(t) + tt.mutate(cfg) + + srv, err := NewServer(cfg, zap.NewNop()) + if err == nil { + srv.Close() + t.Fatal("expected startup to fail on partial stealth config") + } + for _, field := range tt.wantMissing { + if !strings.Contains(err.Error(), field) { + t.Errorf("error must name the missing field %q; got: %v", field, err) + } + } + }) + } +} + +func TestServer_missingStealthCertFails(t 
*testing.T) { + cfg := baseStealthConfig(t) + cfg.StealthDomain = stealthTestDomain + cfg.TLSStealthCertPath = filepath.Join(t.TempDir(), "absent-cert.pem") + cfg.TLSStealthKeyPath = filepath.Join(t.TempDir(), "absent-key.pem") + + srv, err := NewServer(cfg, zap.NewNop()) + if err == nil { + srv.Close() + t.Fatal("expected startup to fail when the stealth cert file is absent") + } + if !strings.Contains(err.Error(), cfg.TLSStealthCertPath) { + t.Errorf("error must name the missing stealth cert path %q; got: %v", cfg.TLSStealthCertPath, err) + } +} + +func TestServer_fullStealthConfigStarts(t *testing.T) { + cfg := baseStealthConfig(t) + dir := t.TempDir() + stealthCert, stealthKey := writeNamedCert(t, dir, "stealth") + cfg.StealthDomain = stealthTestDomain + cfg.TLSStealthCertPath = stealthCert + cfg.TLSStealthKeyPath = stealthKey + + srv, err := NewServer(cfg, zap.NewNop()) + if err != nil { + t.Fatalf("expected startup to succeed with full stealth config: %v", err) + } + defer srv.Close() + if srv.stealthCertReloader == nil { + t.Error("stealthCertReloader must be set when stealth is fully configured") + } +} diff --git a/core/pkg/turn/stealth_test.go b/core/pkg/turn/stealth_test.go new file mode 100644 index 0000000..5ed36ad --- /dev/null +++ b/core/pkg/turn/stealth_test.go @@ -0,0 +1,53 @@ +package turn + +import ( + "regexp" + "strings" + "testing" +) + +func TestStealthHostForNamespace_deterministic(t *testing.T) { + a := StealthHostForNamespace("anchat-test", "orama-devnet.network") + b := StealthHostForNamespace("anchat-test", "orama-devnet.network") + if a != b { + t.Fatalf("not deterministic: %q vs %q", a, b) + } + if !strings.HasPrefix(a, "cdn-") || !strings.HasSuffix(a, ".orama-devnet.network") { + t.Errorf("unexpected shape: %q", a) + } + // label = "cdn-" + 12 hex chars + label := strings.SplitN(a, ".", 2)[0] + if len(label) != len("cdn-")+stealthHostHashBytes*2 { + t.Errorf("label %q has wrong length", label) + } +} + +func 
TestStealthHostForNamespace_namespaceNotLeaked(t *testing.T) { + h := StealthHostForNamespace("anchat-test", "orama-devnet.network") + if strings.Contains(h, "anchat") { + t.Errorf("stealth host %q leaks the namespace name", h) + } +} + +func TestStealthHostForNamespace_distinctPerNamespace(t *testing.T) { + a := StealthHostForNamespace("ns-a", "example.com") + b := StealthHostForNamespace("ns-b", "example.com") + if a == b { + t.Fatalf("different namespaces produced the same stealth host %q", a) + } +} + +// TestStealthHostForNamespace_matchesDNSNameAllowlist guards the contract that +// the derived host always passes the Caddyfile DNS-name allowlist +// (pkg/namespace turn_cert.go dnsNamePattern) — a legitimate stealth domain +// must never be rejected by that defense-in-depth check. Mirrors the same +// conservative pattern here to avoid an import cycle. +func TestStealthHostForNamespace_matchesDNSNameAllowlist(t *testing.T) { + dnsName := regexp.MustCompile(`^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)+$`) + for _, ns := range []string{"anchat-test", "a", "ns-with-many-dashes", "x1y2z3"} { + h := StealthHostForNamespace(ns, "orama-devnet.network") + if !dnsName.MatchString(h) { + t.Errorf("derived stealth host %q for ns %q fails the DNS-name allowlist", h, ns) + } + } +}