feat(gateway): implement ntfy cluster fan-out and improve secrets encryption

- Add `ntfyFanoutResolver` to distribute push notifications across all active cluster nodes, ensuring delivery when nodes lack shared state.
- Refactor secrets encryption key derivation to use cluster-wide secrets via HKDF, replacing ephemeral per-node keys to fix cross-node decryption issues.
- Add unit tests for fan-out resolution logic and caching behavior.
This commit is contained in:
anonpenguin23 2026-06-13 09:23:14 +03:00
parent 4ae8fa941d
commit 34f9da6f8d
11 changed files with 1058 additions and 186 deletions

View File

@ -5,6 +5,7 @@ import (
"database/sql" "database/sql"
"fmt" "fmt"
"net" "net"
"net/url"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
@ -478,15 +479,21 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
// Create secrets manager for serverless functions (AES-256-GCM encrypted). // Create secrets manager for serverless functions (AES-256-GCM encrypted).
// //
// The encryption key comes from the gateway Config (loaded from // The encryption key is DERIVED from the cluster secret via HKDF
// ~/.orama/secrets/secrets-encryption-key), NOT from engineCfg — engineCfg // (resolveSecretsEncryptionKeyHex), so every gateway in the cluster computes
// never has the key set, so passing it always produced a per-process // the identical key and a secret written on one node decrypts on every other
// ephemeral key and made get_secret return undecryptable values // node and survives rolling upgrades. This replaces the old per-node
// (bugboard #837). allowEphemeral=false: a missing/invalid key fails // crypto/rand key file, whose divergence across an upgraded cluster kept
// get_secret broken (bugboard #837). The file key (cfg.SecretsEncryptionKey)
// remains only as a fallback when no cluster secret is available (legacy /
// single-node test rigs). allowEphemeral=false: a missing/invalid key fails
// loudly here and disables get_secret rather than silently corrupting // loudly here and disables get_secret rather than silently corrupting
// secrets. // secrets.
var secretsMgr serverless.SecretsManager var secretsMgr serverless.SecretsManager
if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, cfg.SecretsEncryptionKey, false, logger.Logger); secretsErr != nil { if secretsKeyHex, keyErr := resolveSecretsEncryptionKeyHex(cfg.ClusterSecret, cfg.SecretsEncryptionKey); keyErr != nil {
logger.ComponentWarn(logging.ComponentGeneral, "Failed to derive secrets encryption key; get_secret will be unavailable",
zap.Error(keyErr))
} else if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, secretsKeyHex, false, logger.Logger); secretsErr != nil {
logger.ComponentWarn(logging.ComponentGeneral, "Failed to initialize secrets manager; get_secret will be unavailable", logger.ComponentWarn(logging.ComponentGeneral, "Failed to initialize secrets manager; get_secret will be unavailable",
zap.Error(secretsErr)) zap.Error(secretsErr))
} else { } else {
@ -504,7 +511,7 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
// //
// PushDispatcher (legacy) is set only when YAML defaults exist — // PushDispatcher (legacy) is set only when YAML defaults exist —
// kept for back-compat with code that hasn't migrated to Manager. // kept for back-compat with code that hasn't migrated to Manager.
pushDispatcher, pushStore, pushManager, pushCfgStore, pushCredManager, err := buildPushDispatcher(cfg, deps.ORMClient, logger) pushDispatcher, pushStore, pushManager, pushCfgStore, pushCredManager, err := buildPushDispatcher(cfg, deps.ORMClient, deps.Client, logger)
if err != nil { if err != nil {
// Non-fatal: log and continue. Functions calling push_send will get nil // Non-fatal: log and continue. Functions calling push_send will get nil
// (silent no-op) and HTTP /v1/push/* endpoints return 503. // (silent no-op) and HTTP /v1/push/* endpoints return 503.
@ -921,6 +928,7 @@ func appendRQLiteQueryParams(dsn string) string {
func buildPushDispatcher( func buildPushDispatcher(
cfg *Config, cfg *Config,
db rqlite.Client, db rqlite.Client,
globalDB client.NetworkClient,
logger *logging.ColoredLogger, logger *logging.ColoredLogger,
) (*push.PushDispatcher, push.PushDeviceStore, *push.Manager, push.ConfigStore, *pushcreds.Manager, error) { ) (*push.PushDispatcher, push.PushDeviceStore, *push.Manager, push.ConfigStore, *pushcreds.Manager, error) {
if cfg.ClusterSecret == "" { if cfg.ClusterSecret == "" {
@ -957,6 +965,25 @@ func buildPushDispatcher(
pushcreds.Register(pushapns.NewValidator()) pushcreds.Register(pushapns.NewValidator())
pushcreds.Register(pushntfy.NewValidator()) pushcreds.Register(pushntfy.NewValidator())
// ntfy cluster fan-out (bugboard #858): the default push infra runs an
// independent ntfy per node with no shared store, so a publish must reach
// EVERY active node for the subscriber's instance (picked by round-robin
// DNS) to receive it. Build a resolver over the global dns_nodes table; the
// factory attaches it only to providers using the shared default base URL
// (a namespace pointing ntfy at its own server is never fanned across our
// cluster). nil globalDB or an unparseable base URL → no fan-out (provider
// falls back to the single base URL).
var ntfyFanout *ntfyFanoutResolver
var ntfyFanoutHost string
if globalDB != nil {
if base := strings.TrimSpace(cfg.NtfyBaseURL); base != "" {
if u, perr := url.Parse(base); perr == nil && u.Hostname() != "" {
ntfyFanoutHost = u.Hostname()
ntfyFanout = newNtfyFanoutResolver(globalDB, u.Scheme, u.Port(), defaultNtfyFanoutTTL)
}
}
}
// ProviderFactory turns a resolved Config into the right set of // ProviderFactory turns a resolved Config into the right set of
// provider instances. Lives here in dependencies.go because this is // provider instances. Lives here in dependencies.go because this is
// the only place that imports both the manager package and the // the only place that imports both the manager package and the
@ -997,6 +1024,13 @@ func buildPushDispatcher(
} }
} }
if ntfyCfg.BaseURL != "" { if ntfyCfg.BaseURL != "" {
// Fan out across all push nodes ONLY for the shared default infra.
// A namespace that overrode BaseURL with its own ntfy server keeps
// single-host delivery (its server, not our cluster).
if ntfyFanout != nil && ntfyCfg.BaseURL == cfg.NtfyBaseURL {
ntfyCfg.FanoutResolver = ntfyFanout.Hosts
ntfyCfg.FanoutHostHeader = ntfyFanoutHost
}
ps = append(ps, pushntfy.New(ntfyCfg, logger.Logger)) ps = append(ps, pushntfy.New(ntfyCfg, logger.Logger))
} }
if c.ExpoAccessToken != "" { if c.ExpoAccessToken != "" {

View File

@ -0,0 +1,95 @@
package gateway
import (
"context"
"fmt"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
)
// defaultNtfyFanoutTTL bounds how long the active-push-node list is cached
// before re-querying dns_nodes. Matches the DNS heartbeat cadence, so a node
// added/removed is picked up within a heartbeat without hammering rqlite on
// every push.
const defaultNtfyFanoutTTL = 30 * time.Second
// ntfyFanoutResolver resolves the set of ntfy publish base URLs (one per active
// push node) for fan-out delivery, caching the result for a short TTL. Each
// node runs an independent ntfy with no shared store, so a publish must reach
// every node for the subscriber's instance to receive it (bugboard #858).
type ntfyFanoutResolver struct {
// query returns the public IPs of the currently-active push nodes. Injected
// so the cache/transform logic is unit-testable without a live cluster.
query func(ctx context.Context) ([]string, error)
scheme string // "https" (prod) / "http" (dev), from the configured base URL
port string // explicit port from the base URL, or "" for the scheme default
ttl time.Duration
mu sync.Mutex
cached []string
cachedAt time.Time
}
// newNtfyFanoutResolver builds a resolver backed by the global dns_nodes table.
func newNtfyFanoutResolver(globalDB client.NetworkClient, scheme, port string, ttl time.Duration) *ntfyFanoutResolver {
return &ntfyFanoutResolver{
scheme: scheme,
port: port,
ttl: ttl,
query: func(ctx context.Context) ([]string, error) {
db := globalDB.Database()
res, err := db.Query(client.WithInternalAuth(ctx), "SELECT ip_address FROM dns_nodes WHERE status = 'active'")
if err != nil {
return nil, fmt.Errorf("query active push nodes: %w", err)
}
if res == nil {
return nil, nil
}
ips := make([]string, 0, len(res.Rows))
for _, row := range res.Rows {
if len(row) == 0 {
continue
}
if ip, ok := row[0].(string); ok && ip != "" {
ips = append(ips, ip)
}
}
return ips, nil
},
}
}
// Hosts returns the cached fan-out base URLs, refreshing from the query when the
// cache is stale. On a query error it returns the last-known list (possibly nil)
// alongside the error, so the caller can decide to fall back to its base URL
// rather than dropping a push.
func (r *ntfyFanoutResolver) Hosts(ctx context.Context) ([]string, error) {
r.mu.Lock()
defer r.mu.Unlock()
if r.cached != nil && time.Since(r.cachedAt) < r.ttl {
return r.cached, nil
}
ips, err := r.query(ctx)
if err != nil {
return r.cached, err
}
hosts := make([]string, 0, len(ips))
suffix := ""
if r.port != "" {
suffix = ":" + r.port
}
for _, ip := range ips {
if ip == "" {
continue
}
hosts = append(hosts, r.scheme+"://"+ip+suffix)
}
r.cached = hosts
r.cachedAt = time.Now()
return hosts, nil
}

View File

@ -0,0 +1,125 @@
package gateway
import (
"context"
"errors"
"testing"
"time"
)
// Bugboard #858 — the fan-out resolver turns active dns_nodes into ntfy publish
// base URLs and caches them for a short TTL. These pin the transform + caching.
func TestNtfyFanoutResolver_buildsSchemeHostPort(t *testing.T) {
r := &ntfyFanoutResolver{
scheme: "https",
port: "",
ttl: time.Minute,
query: func(context.Context) ([]string, error) { return []string{"1.2.3.4", "5.6.7.8"}, nil },
}
hosts, err := r.Hosts(context.Background())
if err != nil {
t.Fatalf("Hosts: %v", err)
}
want := []string{"https://1.2.3.4", "https://5.6.7.8"}
if len(hosts) != len(want) {
t.Fatalf("got %v; want %v", hosts, want)
}
for i := range want {
if hosts[i] != want[i] {
t.Errorf("host[%d] = %q; want %q", i, hosts[i], want[i])
}
}
}
func TestNtfyFanoutResolver_includesExplicitPort(t *testing.T) {
r := &ntfyFanoutResolver{
scheme: "http",
port: "8090",
ttl: time.Minute,
query: func(context.Context) ([]string, error) { return []string{"10.0.0.6"}, nil },
}
hosts, _ := r.Hosts(context.Background())
if len(hosts) != 1 || hosts[0] != "http://10.0.0.6:8090" {
t.Errorf("got %v; want [http://10.0.0.6:8090]", hosts)
}
}
func TestNtfyFanoutResolver_skipsEmptyIPs(t *testing.T) {
r := &ntfyFanoutResolver{
scheme: "https",
ttl: time.Minute,
query: func(context.Context) ([]string, error) { return []string{"", "1.2.3.4", ""}, nil },
}
hosts, _ := r.Hosts(context.Background())
if len(hosts) != 1 || hosts[0] != "https://1.2.3.4" {
t.Errorf("got %v; want only the non-empty IP", hosts)
}
}
func TestNtfyFanoutResolver_cachesWithinTTL(t *testing.T) {
calls := 0
r := &ntfyFanoutResolver{
scheme: "https",
ttl: time.Minute,
query: func(context.Context) ([]string, error) {
calls++
return []string{"1.2.3.4"}, nil
},
}
for i := 0; i < 3; i++ {
if _, err := r.Hosts(context.Background()); err != nil {
t.Fatalf("Hosts: %v", err)
}
}
if calls != 1 {
t.Errorf("query called %d times; want 1 (cached within TTL)", calls)
}
}
func TestNtfyFanoutResolver_requeriesAfterTTL(t *testing.T) {
calls := 0
r := &ntfyFanoutResolver{
scheme: "https",
ttl: time.Nanosecond, // expire immediately
query: func(context.Context) ([]string, error) {
calls++
return []string{"1.2.3.4"}, nil
},
}
_, _ = r.Hosts(context.Background())
time.Sleep(time.Millisecond)
_, _ = r.Hosts(context.Background())
if calls != 2 {
t.Errorf("query called %d times; want 2 (TTL expired between calls)", calls)
}
}
func TestNtfyFanoutResolver_queryError_returnsStaleCache(t *testing.T) {
fail := false
r := &ntfyFanoutResolver{
scheme: "https",
ttl: time.Nanosecond,
query: func(context.Context) ([]string, error) {
if fail {
return nil, errors.New("rqlite unreachable")
}
return []string{"1.2.3.4"}, nil
},
}
// Prime the cache.
if _, err := r.Hosts(context.Background()); err != nil {
t.Fatalf("prime: %v", err)
}
time.Sleep(time.Millisecond)
// Now the query fails — Hosts must return the stale cache alongside the error
// so the caller can fall back rather than drop the push.
fail = true
hosts, err := r.Hosts(context.Background())
if err == nil {
t.Fatal("want the query error surfaced")
}
if len(hosts) != 1 || hosts[0] != "https://1.2.3.4" {
t.Errorf("want the stale cache returned on error; got %v", hosts)
}
}

View File

@ -0,0 +1,49 @@
package gateway
import (
"encoding/hex"
"strings"
"github.com/DeBrosOfficial/network/pkg/secrets"
)
// secretsEncryptionDerivePurpose is the HKDF info label used to derive the
// function-secrets AES-256 key from the cluster secret. Deriving it (instead of
// generating a per-node crypto/rand key file) guarantees every gateway in the
// cluster computes the IDENTICAL key, so a secret written on one node decrypts
// on every other node and survives rolling upgrades — eliminating the
// key-divergence / convergence-window class that kept get_secret broken for
// days (bugboard #837). Same pattern as the cluster-wide JWT signing key
// (jwtEdDSADerivePurpose) and the TURN encryption key ("turn-encryption").
//
// Bumping the version label (e.g. "...-v2") is a DELIBERATE rotation that
// invalidates every stored function secret (they must be re-`set`). It must
// never be changed casually.
const secretsEncryptionDerivePurpose = "orama-secrets-encryption-v1"
// resolveSecretsEncryptionKeyHex returns the hex-encoded AES-256 key the
// serverless secrets manager should use to encrypt/decrypt function secrets.
//
// Primary: derive deterministically from the cluster secret via HKDF, so the
// key is identical on every gateway in the cluster and stable across restarts
// and rolling upgrades. The cluster secret is TrimSpace'd first so a stray
// trailing newline on one node's secret file can't silently diverge its derived
// key from the rest of the cluster (the host gateway reads the file untrimmed
// while the namespace gateway trims it — without this they could derive
// different keys and reintroduce #837).
//
// Fallback: when no cluster secret is available (single-node test rigs / legacy
// deployments without a shared secret), fall back to an explicitly-configured
// key file. An empty result then makes the production secrets manager fail loud
// (NewDBSecretsManager with allowEphemeral=false), rather than silently using a
// per-process ephemeral key.
func resolveSecretsEncryptionKeyHex(clusterSecret, fileKeyHex string) (string, error) {
if cs := strings.TrimSpace(clusterSecret); cs != "" {
key, err := secrets.DeriveKey(cs, secretsEncryptionDerivePurpose)
if err != nil {
return "", err
}
return hex.EncodeToString(key), nil
}
return strings.TrimSpace(fileKeyHex), nil
}

View File

@ -0,0 +1,95 @@
package gateway
import (
"encoding/hex"
"testing"
"github.com/DeBrosOfficial/network/pkg/secrets"
)
// Bugboard #837 — the function-secrets AES key must be DERIVED from the cluster
// secret (not a per-node random file), so every gateway computes the identical
// key and stored secrets survive rolling upgrades. These pin the derivation.
func TestResolveSecretsEncryptionKeyHex_deterministic(t *testing.T) {
// Same cluster secret → byte-identical key, every time. This is the whole
// point: any gateway in the cluster derives the same key, so a secret set on
// one node decrypts on all others.
const cs = "cluster-secret-abc123"
a, err := resolveSecretsEncryptionKeyHex(cs, "")
if err != nil {
t.Fatalf("resolve: %v", err)
}
b, err := resolveSecretsEncryptionKeyHex(cs, "")
if err != nil {
t.Fatalf("resolve: %v", err)
}
if a == "" || a != b {
t.Fatalf("derivation not deterministic: %q vs %q", a, b)
}
// Valid AES-256 key: 32 bytes = 64 hex chars.
raw, err := hex.DecodeString(a)
if err != nil || len(raw) != 32 {
t.Errorf("derived key is not 32-byte hex: len(raw)=%d err=%v", len(raw), err)
}
}
func TestResolveSecretsEncryptionKeyHex_trimInvariant(t *testing.T) {
// A trailing newline on one node's cluster-secret file must NOT change the
// derived key — otherwise the host gateway (reads untrimmed) and a namespace
// gateway (reads trimmed) would diverge and reintroduce #837.
trimmed, _ := resolveSecretsEncryptionKeyHex("cluster-secret-abc123", "")
withNL, _ := resolveSecretsEncryptionKeyHex("cluster-secret-abc123\n", "")
withSpaces, _ := resolveSecretsEncryptionKeyHex(" cluster-secret-abc123\t\n", "")
if trimmed != withNL || trimmed != withSpaces {
t.Errorf("derived key is not whitespace-invariant: %q / %q / %q", trimmed, withNL, withSpaces)
}
}
func TestResolveSecretsEncryptionKeyHex_distinctSecretsDistinctKeys(t *testing.T) {
a, _ := resolveSecretsEncryptionKeyHex("cluster-secret-A", "")
b, _ := resolveSecretsEncryptionKeyHex("cluster-secret-B", "")
if a == b {
t.Errorf("distinct cluster secrets must derive distinct keys; both = %q", a)
}
}
func TestResolveSecretsEncryptionKeyHex_purposeSeparatedFromTURN(t *testing.T) {
// The secrets key must NOT equal the TURN key derived from the same cluster
// secret — domain separation via the HKDF info label.
const cs = "cluster-secret-abc123"
secretsHex, _ := resolveSecretsEncryptionKeyHex(cs, "")
turnKey, err := secrets.DeriveKey(cs, "turn-encryption")
if err != nil {
t.Fatalf("derive turn key: %v", err)
}
if secretsHex == hex.EncodeToString(turnKey) {
t.Error("secrets key collides with the TURN key — HKDF purpose label not providing domain separation")
}
}
func TestResolveSecretsEncryptionKeyHex_emptyClusterSecretUsesFileKey(t *testing.T) {
// Legacy/test rigs with no cluster secret fall back to the explicitly
// configured file key (trimmed).
const fileKey = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
got, err := resolveSecretsEncryptionKeyHex("", fileKey+"\n")
if err != nil {
t.Fatalf("resolve: %v", err)
}
if got != fileKey {
t.Errorf("empty cluster secret should return the trimmed file key; got %q", got)
}
}
func TestResolveSecretsEncryptionKeyHex_emptyBothReturnsEmpty(t *testing.T) {
// No cluster secret AND no file key → empty result, which makes the
// production secrets manager fail loud (allowEphemeral=false) instead of
// silently using an ephemeral key.
got, err := resolveSecretsEncryptionKeyHex("", "")
if err != nil {
t.Fatalf("resolve: %v", err)
}
if got != "" {
t.Errorf("want empty result when neither source has a key; got %q", got)
}
}

View File

@ -1785,15 +1785,24 @@ func (cm *ClusterManager) saveLocalState(state *ClusterLocalState) error {
return fmt.Errorf("failed to marshal state: %w", err) return fmt.Errorf("failed to marshal state: %w", err)
} }
path := filepath.Join(dir, "cluster-state.json") path := filepath.Join(dir, "cluster-state.json")
// 0600: this file now carries the namespace TURN shared secret for // Atomic write: this file now carries the namespace TURN shared secret
// cold-start resilience (bugboard #130), so it must not be world/group // (bugboard #130) and is rewritten from multiple converge paths. Write a
// readable. WriteFile's mode only applies on create — chmod explicitly so a // temp file then rename over the target so a reader (or a concurrent
// file written 0644 by an older release is tightened on the next rewrite. // writer) never observes a half-written secret — rename is atomic on the
if err := os.WriteFile(path, data, 0600); err != nil { // same filesystem. 0600 + chmod on the temp file keeps the secret out of
return fmt.Errorf("failed to write state file: %w", err) // world/group read; the rename then makes the live file 0600 too, which
// also tightens a file an older release left at 0644.
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0600); err != nil {
return fmt.Errorf("failed to write temp state file: %w", err)
} }
if err := os.Chmod(path, 0600); err != nil { if err := os.Chmod(tmp, 0600); err != nil {
return fmt.Errorf("failed to set state file permissions: %w", err) os.Remove(tmp)
return fmt.Errorf("failed to set temp state file permissions: %w", err)
}
if err := os.Rename(tmp, path); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to rename state file into place: %w", err)
} }
cm.logger.Info("Saved cluster local state", zap.String("namespace", state.NamespaceName), zap.String("path", path)) cm.logger.Info("Saved cluster local state", zap.String("namespace", state.NamespaceName), zap.String("path", path))
return nil return nil
@ -1855,6 +1864,29 @@ const (
webrtcResolveRetryDelay = 2 * time.Second webrtcResolveRetryDelay = 2 * time.Second
) )
// resolveWebRTCConfigWithRetry calls fetch up to `retries` times, sleeping
// `delay` between attempts, and returns the first result whose error is nil. A
// distant/just-restarted node's namespace rqlite can take a few seconds to
// become readable; without the retry the read fails once and the gateway comes
// up with TURN disabled (bugboard #130). A genuine decrypt failure (stale
// cluster-secret) also errors and exhausts the retries, returning the final
// error so the caller can mark the result unresolved. `sleep` is injected so
// unit tests exercise the loop without real delay.
func resolveWebRTCConfigWithRetry(retries int, delay time.Duration, sleep func(time.Duration), fetch func() (*WebRTCConfig, error)) (*WebRTCConfig, error) {
var cfg *WebRTCConfig
var err error
for attempt := 0; attempt < retries; attempt++ {
cfg, err = fetch()
if err == nil {
return cfg, nil
}
if attempt < retries-1 {
sleep(delay)
}
}
return cfg, err
}
// applyResolvedWebRTCToState copies a freshly-resolved WebRTC config into the // applyResolvedWebRTCToState copies a freshly-resolved WebRTC config into the
// local cluster state so a future cold start can read the TURN secret from disk // local cluster state so a future cold start can read the TURN secret from disk
// instead of the (possibly-slow) namespace rqlite (bugboard #130). Returns true // instead of the (possibly-slow) namespace rqlite (bugboard #130). Returns true
@ -1886,12 +1918,13 @@ type restoreWebRTC struct {
turnDomain string turnDomain string
turnSecret string turnSecret string
stealthDomain string // feat-124: empty when webrtc stealth is disabled stealthDomain string // feat-124: empty when webrtc stealth is disabled
// unresolved is true when the state file had no TURN secret AND the DB // unresolved is true when the DB lookup ERRORED (vs. resolved-but-not-
// fallback ERRORED (vs. resolved-but-not-enabled). The caller must NOT // enabled) AND the local cache had no secret to fall back to. The caller
// write a WebRTC-disabled gateway config off an unresolved lookup — that // must NOT write a WebRTC-disabled gateway config off an unresolved
// silently kills turn.credentials on a node that should serve TURN // lookup — that silently kills turn.credentials on a node that should
// (bugboard #130: a decrypt failure after cluster-secret rotation was // serve TURN (bugboard #130: a decrypt failure after cluster-secret
// swallowed into "disabled"). enabled is always false when unresolved. // rotation was swallowed into "disabled"). enabled is always false when
// unresolved.
unresolved bool unresolved bool
} }
@ -1905,9 +1938,18 @@ type restoreWebRTC struct {
// - SFU (sfuPort) is PER-NODE — non-zero only when this node runs a // - SFU (sfuPort) is PER-NODE — non-zero only when this node runs a
// local SFU (for /v1/webrtc/signal + /rooms proxying). // local SFU (for /v1/webrtc/signal + /rooms proxying).
// //
// Precedence: prefer the local state file; fall back to the DB (source of // Precedence: DB-FIRST. The namespace_webrtc_config row is the source of
// truth) when the state file lacks the TURN secret (the namespace-wide // truth for the CURRENT TURN secret, so we always consult it. The local
// "webrtc is enabled" marker). dbFetch is lazy — only hit when needed. // cluster-state.json cache (dbFetch's counterpart) is a FALLBACK ONLY —
// used when the DB read fails (a slow/just-restarted node whose namespace
// rqlite has not synced yet). This is the bugboard #130 FOLLOW-UP fix: the
// earlier state-FIRST read short-circuited the DB whenever the cache held a
// secret and so NEVER re-validated a present-but-stale cached secret. If a
// secret was rotated (disable→enable) while a node was offline, that node
// kept serving the OLD secret indefinitely. DB-first means a stale cache
// can survive at most until the DB becomes readable on the next converge —
// never indefinitely — while still letting a genuinely DB-down node come up
// on TURN via the cache (the #130 resilience the cache was added for).
// //
// `enabled` is true when EITHER a TURN secret OR an SFU port is present, // `enabled` is true when EITHER a TURN secret OR an SFU port is present,
// so the caller knows to write a webrtc block. A non-SFU gateway gets // so the caller knows to write a webrtc block. A non-SFU gateway gets
@ -1920,52 +1962,45 @@ func chooseRestoreWebRTC(
stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string, stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string,
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool), dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool),
) restoreWebRTC { ) restoreWebRTC {
turnSecret := stateTURNSecret // DB-first: consult the source of truth before trusting the local cache.
turnDomain := stateTURNDomain dbSecret, dbDomain, dbStealth, dbSFU, resolved := dbFetch()
stealthDomain := stateStealthDomain if resolved {
// The DB read landed and is authoritative. dbSecret == "" means the
// namespace genuinely has no WebRTC enabled — honor that (disable),
// do NOT fall back to a possibly-stale cached secret. A present
// secret is the CURRENT one and wins over any cached value.
if dbSecret == "" {
return restoreWebRTC{}
}
return restoreWebRTC{
enabled: true,
sfuPort: dbSFU,
turnDomain: dbDomain,
turnSecret: dbSecret,
stealthDomain: dbStealth,
}
}
// The DB/decrypt lookup ERRORED (slow node whose namespace rqlite is not
// readable yet, or a decrypt failure after a cluster-secret rotation).
// Fall back to the locally-cached secret so TURN still comes up — possibly
// stale, but functional, and self-correcting on the next converge once the
// DB is readable (NOT indefinite). If the cache is empty too, signal
// unresolved so the caller preserves the running gateway config instead of
// blanking TURN (bugboard #130).
sfuPort := 0 sfuPort := 0
if stateHasSFU && stateSFUPort > 0 { if stateHasSFU && stateSFUPort > 0 {
sfuPort = stateSFUPort sfuPort = stateSFUPort
} }
if stateTURNSecret == "" && sfuPort == 0 {
// Fall back to the DB when the state file has no TURN secret — that's return restoreWebRTC{unresolved: true}
// the marker that the namespace has WebRTC enabled at all. The state
// file is not updated by EnableWebRTC, so a namespace enabled after
// the state file was written reaches here with an empty secret.
// (Stealth toggles DO rewrite cluster state on every node, so the
// state-first read stays fresh for stealthDomain too.)
unresolved := false
if turnSecret == "" {
dbSecret, dbDomain, dbStealth, dbSFU, resolved := dbFetch()
switch {
case !resolved:
// The DB/decrypt lookup ERRORED — we do not know whether WebRTC
// is enabled. This is DISTINCT from resolved-but-empty (genuinely
// disabled). Signal unresolved so the caller preserves the
// running config instead of writing a TURN-disabled one
// (bugboard #130).
unresolved = true
case dbSecret != "":
turnSecret = dbSecret
if turnDomain == "" {
turnDomain = dbDomain
} }
if stealthDomain == "" {
stealthDomain = dbStealth
}
if sfuPort == 0 {
sfuPort = dbSFU
}
}
}
return restoreWebRTC{ return restoreWebRTC{
enabled: !unresolved && (turnSecret != "" || sfuPort > 0), enabled: stateTURNSecret != "" || sfuPort > 0,
unresolved: unresolved,
sfuPort: sfuPort, sfuPort: sfuPort,
turnDomain: turnDomain, turnDomain: stateTURNDomain,
turnSecret: turnSecret, turnSecret: stateTURNSecret,
stealthDomain: stealthDomain, stealthDomain: stateStealthDomain,
} }
} }
@ -2114,12 +2149,12 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
SecretsEncryptionKey: cm.secretsEncryptionKey, SecretsEncryptionKey: cm.secretsEncryptionKey,
} }
// Resolve WebRTC config. Prefer the local state file; fall back to // Resolve WebRTC config. DB-FIRST (source of truth for the CURRENT
// the DB (source of truth) to self-heal stale state. Bugboard #25 — // secret); the local state cache is consulted only when the DB read
// the state file is NOT updated by EnableWebRTC, so a namespace // fails (bugboard #130 follow-up — see chooseRestoreWebRTC). Bugboard
// enabled AFTER its state file was written carries no SFU/TURN // #25 — the state file is NOT updated by EnableWebRTC, so a namespace
// fields here. The lazy dbFetch only hits the DB when the state // enabled AFTER its state file was written carries no SFU/TURN fields
// file is incomplete. // here; reading the DB re-materializes them.
wr := chooseRestoreWebRTC( wr := chooseRestoreWebRTC(
state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain, state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain,
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) { func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) {
@ -2132,17 +2167,11 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
// decrypt failure (stale key) also errors here and will exhaust // decrypt failure (stale key) also errors here and will exhaust
// the retries → unresolved → the caller preserves the running // the retries → unresolved → the caller preserves the running
// config rather than blanking it. // config rather than blanking it.
var webrtcCfg *WebRTCConfig webrtcCfg, err := resolveWebRTCConfigWithRetry(
var err error webrtcResolveRetries, webrtcResolveRetryDelay, time.Sleep,
for attempt := 0; attempt < webrtcResolveRetries; attempt++ { func() (*WebRTCConfig, error) {
webrtcCfg, err = cm.GetWebRTCConfig(ctx, state.NamespaceName) return cm.GetWebRTCConfig(ctx, state.NamespaceName)
if err == nil { })
break // success — webrtcCfg may be nil (genuinely disabled)
}
if attempt < webrtcResolveRetries-1 {
time.Sleep(webrtcResolveRetryDelay)
}
}
if err != nil { if err != nil {
// Persistent error after retries (slow read that never // Persistent error after retries (slow read that never
// landed, or a decrypt failure). Do NOT swallow into // landed, or a decrypt failure). Do NOT swallow into
@ -2182,13 +2211,15 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
gwCfg.TURNSecret = wr.turnSecret gwCfg.TURNSecret = wr.turnSecret
gwCfg.TURNStealthDomain = wr.stealthDomain gwCfg.TURNStealthDomain = wr.stealthDomain
// Cache the resolved secret into THIS node's local state so the // Cache the resolved secret into THIS node's local state so that if
// NEXT cold start reads it from disk (state-first in // the NEXT cold start can't read the namespace rqlite (a distant/
// chooseRestoreWebRTC short-circuits before the DB) instead of // slow node whose follower hasn't synced), chooseRestoreWebRTC can
// depending on a live, possibly-slow namespace-rqlite read — which // fall back to this on-disk secret instead of coming up with TURN
// is exactly what left a distant/slow node's gateway with TURN // disabled (bugboard #130). The cache is a FALLBACK — DB-first
// disabled on restart (bugboard #130). Each node self-heals its own // resolution still prefers the live DB secret whenever it's
// cache on a successful resolve; nothing is sent cross-node. // readable, so this cached value can never pin the node to a stale
// secret. Each node self-heals its own cache on a successful
// resolve; nothing is sent cross-node.
if applyResolvedWebRTCToState(state, wr) { if applyResolvedWebRTCToState(state, wr) {
if err := cm.saveLocalState(state); err != nil { if err := cm.saveLocalState(state); err != nil {
cm.logger.Warn("Failed to cache resolved WebRTC config to local state (cold start may fall back to the DB read next boot)", cm.logger.Warn("Failed to cache resolved WebRTC config to local state (cold start may fall back to the DB read next boot)",
@ -2198,6 +2229,24 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
zap.String("namespace", state.NamespaceName)) zap.String("namespace", state.NamespaceName))
} }
} }
} else if !wr.unresolved {
// The DB read RESOLVED that this namespace has NO WebRTC (disabled).
// Clear any stale cached secret from local state so a future cold
// start that hits a transient DB error can't fall back to it and
// resurrect TURN for a disabled namespace — the hole being: a node
// that was offline during DisableWebRTC never received the cleared
// state push and would otherwise keep serving the old secret. Only
// do this on a RESOLVED-disabled read, NEVER on an unresolved
// (DB-error) one — there the cache IS the fallback and must survive.
if applyResolvedWebRTCToState(state, restoreWebRTC{}) {
if err := cm.saveLocalState(state); err != nil {
cm.logger.Warn("Failed to clear stale cached WebRTC secret from local state after DB reported the namespace disabled",
zap.String("namespace", state.NamespaceName), zap.Error(err))
} else {
cm.logger.Info("Cleared stale cached WebRTC secret from local state (namespace disabled in DB)",
zap.String("namespace", state.NamespaceName))
}
}
} }
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort)) resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort))

View File

@ -0,0 +1,71 @@
package namespace
import (
"os"
"path/filepath"
"testing"
"go.uber.org/zap"
)
// Bugboard #130 — cluster-state.json carries the namespace TURN shared secret
// (plaintext HMAC), so every writer of it must produce a 0600 file and tighten
// any pre-existing world-readable file on rewrite. SaveClusterState is the
// RECEIVER-side writer that persists state pushed from the coordinator to a
// remote namespace node; without this it landed 0644.
func TestSaveClusterState_writes0600(t *testing.T) {
base := t.TempDir()
s := &SystemdSpawner{namespaceBase: base, logger: zap.NewNop()}
if err := s.SaveClusterState("ns-test", []byte(`{"turn_shared_secret":"sek-123"}`)); err != nil {
t.Fatalf("SaveClusterState: %v", err)
}
path := filepath.Join(base, "ns-test", "cluster-state.json")
info, err := os.Stat(path)
if err != nil {
t.Fatalf("stat cluster-state.json: %v", err)
}
if perm := info.Mode().Perm(); perm != 0600 {
t.Errorf("cluster-state.json mode = %o; want 0600 (it carries the TURN secret)", perm)
}
// No leftover temp file from the atomic write.
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Errorf("temp file should not survive a successful save; stat err = %v", err)
}
}
func TestSaveClusterState_tightensExisting0644(t *testing.T) {
base := t.TempDir()
s := &SystemdSpawner{namespaceBase: base, logger: zap.NewNop()}
// Simulate a file an older release wrote world-readable.
dir := filepath.Join(base, "ns-test")
if err := os.MkdirAll(dir, 0755); err != nil {
t.Fatal(err)
}
path := filepath.Join(dir, "cluster-state.json")
if err := os.WriteFile(path, []byte(`{"old":true}`), 0644); err != nil {
t.Fatal(err)
}
if err := s.SaveClusterState("ns-test", []byte(`{"turn_shared_secret":"sek-new"}`)); err != nil {
t.Fatalf("SaveClusterState: %v", err)
}
info, err := os.Stat(path)
if err != nil {
t.Fatalf("stat cluster-state.json: %v", err)
}
if perm := info.Mode().Perm(); perm != 0600 {
t.Errorf("rewrite did not tighten perms: mode = %o; want 0600", perm)
}
data, err := os.ReadFile(path)
if err != nil {
t.Fatal(err)
}
if string(data) != `{"turn_shared_secret":"sek-new"}` {
t.Errorf("content not replaced atomically: %s", data)
}
}

View File

@ -1,15 +1,23 @@
package namespace package namespace
import "testing" import (
"errors"
"testing"
"time"
)
// Bugboard #25 — WebRTC config drift on restart + TURN/SFU decouple. // Bugboard #25 — WebRTC config drift on restart + TURN/SFU decouple.
// Bugboard #130 follow-up — DB-FIRST resolution so a stale cached secret can
// never be served indefinitely.
// //
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config from the // chooseRestoreWebRTC resolves a restored gateway's WebRTC config DB-FIRST
// local state file (which EnableWebRTC does NOT update) with a DB fallback // (the namespace_webrtc_config row is the source of truth for the current
// (source of truth). It also DECOUPLES the two aspects: TURN (secret + // secret); the local cluster-state.json cache is a FALLBACK consulted only
// domain) is namespace-wide so ANY gateway can serve credentials; the SFU // when the DB read fails (a slow node whose namespace rqlite hasn't synced).
// port is per-node (0 on a gateway-only node). Pins both the drift // It also DECOUPLES the two aspects: TURN (secret + domain) is namespace-wide
// fallback and the non-SFU-gateway case. // so ANY gateway can serve credentials; the SFU port is per-node (0 on a
// gateway-only node). Pins the drift fallback, the non-SFU-gateway case, and
// the DB-first precedence (DB secret wins over a cached/stale one).
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool). // dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool).
// resolved=true means the lookup completed (with or without a config); // resolved=true means the lookup completed (with or without a config);
@ -23,22 +31,39 @@ func dbFull(secret, domain string, sfuPort int) func() (string, string, string,
return func() (string, string, string, int, bool) { return secret, domain, "", sfuPort, true } return func() (string, string, string, int, bool) { return secret, domain, "", sfuPort, true }
} }
func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) { func TestChooseRestoreWebRTC_dbSecretWinsOverCachedState(t *testing.T) {
// State file has TURN secret → use it, and NEVER consult the DB // THE #130 FOLLOW-UP (staleness) case. The state file holds a cached
// (the lazy dbFetch must not be called — saves a query on the hot // secret, but the DB (source of truth) has a DIFFERENT, current secret —
// restart path). // e.g. the secret was rotated (disable→enable) while this node was offline.
dbCalled := false // DB-first MUST serve the current DB secret, NOT the stale cached one. The
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", // old state-first logic short-circuited the DB here and served "old-secret"
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() }) // indefinitely.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "old-secret", "cdn-old.dbrs.space",
dbFull("new-secret", "turn.ns-x.dbrs.space", 7800))
if dbCalled { if !got.enabled {
t.Error("DB fetch was called even though the state file had the TURN secret (should short-circuit)") t.Fatal("DB has a current secret; result must be enabled")
} }
if !got.enabled || got.sfuPort != 7800 || got.turnSecret != "state-secret" { if got.turnSecret != "new-secret" {
t.Errorf("want state-file values; got %+v", got) t.Errorf("BUG #130 STALENESS: turnSecret = %q; want new-secret (the current DB value, not the stale cache)", got.turnSecret)
} }
if got.turnDomain != "turn.ns-x.dbrs.space" { if got.sfuPort != 7800 || got.turnDomain != "turn.ns-x.dbrs.space" {
t.Errorf("turnDomain = %q; want state-file value", got.turnDomain) t.Errorf("want DB-derived block; got %+v", got)
}
}
func TestChooseRestoreWebRTC_dbDisabledOverridesCachedSecret(t *testing.T) {
// The cache holds a secret but the DB read completes and reports NO WebRTC
// (the namespace was disabled while this node was offline). DB-first must
// honor the disable, NOT keep serving the stale cached secret.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "stale-secret", "",
dbNone) // dbNone = resolved, no config
if got.enabled {
t.Errorf("DB reports disabled: must not keep serving the cached secret; got %+v", got)
}
if got.unresolved {
t.Error("a clean resolved-but-disabled lookup must not be marked unresolved")
} }
} }
@ -84,19 +109,19 @@ func TestChooseRestoreWebRTC_nonSFUGatewayGetsTURNOnly(t *testing.T) {
} }
} }
func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) { func TestChooseRestoreWebRTC_cachedTurnOnlyFallbackOnDBError(t *testing.T) {
// State file for a non-SFU node: it has the TURN secret but HasSFU is // A non-SFU node holds a cached TURN secret (HasSFU false / port 0) and the
// false / port 0. Must use the state TURN secret with sfuPort=0 and // DB read ERRORS (its namespace rqlite isn't readable yet at cold start).
// NOT consult the DB (TURN secret present = complete enough). // DB-first falls back to the cached secret so the gateway still serves TURN
dbCalled := false // credentials — sfuPort stays 0 (no local SFU). This is the #130 resilience
got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "", // the cache exists for.
func() (string, string, string, int, bool) { dbCalled = true; return dbNone() }) got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "", dbError)
if dbCalled {
t.Error("DB fetch called even though state file had the TURN secret")
}
if !got.enabled || got.sfuPort != 0 || got.turnSecret != "state-secret" { if !got.enabled || got.sfuPort != 0 || got.turnSecret != "state-secret" {
t.Errorf("want TURN-only from state (sfuPort 0); got %+v", got) t.Errorf("want cached TURN-only fallback (sfuPort 0); got %+v", got)
}
if got.unresolved {
t.Error("a usable cached secret must not be marked unresolved")
} }
} }
@ -127,16 +152,14 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) {
// --- feat-124 stealth domain restore precedence --- // --- feat-124 stealth domain restore precedence ---
func TestChooseRestoreWebRTC_stealthFromStateFile(t *testing.T) { func TestChooseRestoreWebRTC_stealthFromCacheOnDBError(t *testing.T) {
// Stealth toggles rewrite cluster state, so a fresh state file carries // When the DB read errors, the cache fallback carries the whole block —
// the stealth domain and must win without a DB call. // including the cached stealth domain — so a stealth-enabled namespace
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space", // keeps advertising its stealth rung on a cold start that can't reach the
func() (string, string, string, int, bool) { // DB yet.
t.Error("DB fetch called even though state file was complete") got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space", dbError)
return dbNone() if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" {
}) t.Errorf("stealthDomain = %q; want cached value on DB-error fallback; got %+v", got.stealthDomain, got)
if got.stealthDomain != "cdn-abc123def456.dbrs.space" {
t.Errorf("stealthDomain = %q; want state-file value", got.stealthDomain)
} }
} }
@ -188,27 +211,24 @@ func TestChooseRestoreWebRTC_resolvedEmptyIsDisabledNotUnresolved(t *testing.T)
} }
} }
func TestChooseRestoreWebRTC_stateSecretWinsOverDBError(t *testing.T) { func TestChooseRestoreWebRTC_cachedSecretSurvivesDBError(t *testing.T) {
// A node that already holds the TURN secret in its state file must NOT be // A node that holds the TURN secret in its state file must NOT be disabled
// affected by a DB error — it short-circuits before dbFetch and stays // by a flaky/unsynced DB — when the DB read errors, DB-first falls back to
// enabled/resolved. Guards against the #130 fix accidentally disabling // the cached secret and stays enabled (not unresolved). Guards against the
// healthy nodes when the DB is flaky. // #130 fix accidentally disabling nodes when the DB is briefly unreadable.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbError)
func() (string, string, string, int, bool) {
t.Error("DB fetch must not be called when the state file has the secret")
return dbError()
})
if got.unresolved || !got.enabled || got.turnSecret != "state-secret" { if got.unresolved || !got.enabled || got.turnSecret != "state-secret" {
t.Errorf("state-file secret must win and stay enabled/resolved; got %+v", got) t.Errorf("cached secret must survive a DB error and stay enabled; got %+v", got)
} }
} }
func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) { func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) {
// Stealth disabled everywhere → empty stealthDomain (gateway advertises // Stealth disabled → empty stealthDomain (gateway advertises the baseline
// the baseline 3-rung ladder only). // 3-rung ladder only). Uses the cache-fallback path (DB error) so an
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbNone) // enabled-but-no-stealth config is exercised end to end.
if got.stealthDomain != "" { got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbError)
t.Errorf("stealthDomain = %q; want empty when stealth is disabled", got.stealthDomain) if !got.enabled || got.stealthDomain != "" {
t.Errorf("stealthDomain = %q; want empty when stealth is disabled; got %+v", got.stealthDomain, got)
} }
} }
@ -232,16 +252,12 @@ func TestApplyResolvedWebRTCToState_populatesAndReportsChange(t *testing.T) {
t.Errorf("state not fully populated: %+v", st) t.Errorf("state not fully populated: %+v", st)
} }
// The whole point: a SECOND boot now reads the secret from state and must // The whole point of caching: on a SECOND boot where the DB read fails
// NOT consult the DB (chooseRestoreWebRTC short-circuits). // (slow node, namespace rqlite not synced), the cached secret lets the
dbCalled := false // gateway still come up on TURN (DB-first falls back to the cache).
got := chooseRestoreWebRTC(st.HasSFU, st.SFUSignalingPort, st.TURNDomain, st.TURNSharedSecret, st.TURNStealthDomain, got := chooseRestoreWebRTC(st.HasSFU, st.SFUSignalingPort, st.TURNDomain, st.TURNSharedSecret, st.TURNStealthDomain, dbError)
func() (string, string, string, int, bool) { dbCalled = true; return dbError() })
if dbCalled {
t.Error("BUG #130: cold start still hit the DB even though the secret was cached in local state")
}
if !got.enabled || got.unresolved || got.turnSecret != "sek-123" { if !got.enabled || got.unresolved || got.turnSecret != "sek-123" {
t.Errorf("cached cold start should resolve enabled from state; got %+v", got) t.Errorf("cached cold start should fall back to the state secret on a DB error; got %+v", got)
} }
} }
@ -264,3 +280,97 @@ func TestApplyResolvedWebRTCToState_turnOnlyNode_noSFU(t *testing.T) {
t.Errorf("turn-only node: want HasTURN=true HasSFU=false secret cached; got %+v", st) t.Errorf("turn-only node: want HasTURN=true HasSFU=false secret cached; got %+v", st)
} }
} }
func TestApplyResolvedWebRTCToState_clearsCacheOnDisable(t *testing.T) {
// When the DB resolves the namespace as DISABLED, the caller applies an
// empty restoreWebRTC to wipe any stale cached secret from local state — so
// a node that was offline during DisableWebRTC can't later fall back to the
// old secret on a transient DB error and resurrect TURN for a disabled
// namespace. Must report change=true and zero out the cached fields.
st := &ClusterLocalState{HasTURN: true, HasSFU: true, TURNSharedSecret: "stale-secret", TURNDomain: "turn.ns-x.dbrs.space", SFUSignalingPort: 7800}
if !applyResolvedWebRTCToState(st, restoreWebRTC{}) {
t.Fatal("disable: want change=true when clearing a cached secret")
}
if st.TURNSharedSecret != "" || st.HasTURN || st.HasSFU || st.SFUSignalingPort != 0 || st.TURNDomain != "" {
t.Errorf("cache not fully cleared on disable: %+v", st)
}
}
func TestApplyResolvedWebRTCToState_secretRotationReportsChange(t *testing.T) {
// Secret rotation: the state holds an OLD cached secret and a fresh resolve
// brings the NEW (rotated) secret. applyResolvedWebRTCToState MUST report
// change=true and overwrite the cache, so the node's fallback secret tracks
// the rotation instead of persisting a stale value on disk (bugboard #130
// follow-up — the cache must never lag the rotated secret).
st := &ClusterLocalState{HasTURN: true, TURNSharedSecret: "old-secret", TURNDomain: "turn.ns-x.dbrs.space"}
wr := restoreWebRTC{enabled: true, turnSecret: "new-secret", turnDomain: "turn.ns-x.dbrs.space"}
if !applyResolvedWebRTCToState(st, wr) {
t.Fatal("rotation: want change=true when the resolved secret differs from the cached one")
}
if st.TURNSharedSecret != "new-secret" {
t.Errorf("cache not updated to the rotated secret: got %q; want new-secret", st.TURNSharedSecret)
}
}
// ----------------------------------------------------------------------------
// Bugboard #130 — the cold-start read retries so a slow node's namespace
// rqlite read lands once the follower syncs, instead of failing once and
// coming up with TURN disabled.
// ----------------------------------------------------------------------------
func TestResolveWebRTCConfigWithRetry_succeedsOnNthAttempt(t *testing.T) {
// The read errors on the first two attempts (rqlite not readable yet) then
// succeeds — the retry must return the config and not surface the earlier
// transient errors.
calls := 0
slept := 0
cfg, err := resolveWebRTCConfigWithRetry(5, time.Millisecond, func(time.Duration) { slept++ },
func() (*WebRTCConfig, error) {
calls++
if calls < 3 {
return nil, errors.New("rqlite not readable yet")
}
return &WebRTCConfig{TURNSharedSecret: "sek-123"}, nil
})
if err != nil {
t.Fatalf("want success on the 3rd attempt; got err %v", err)
}
if cfg == nil || cfg.TURNSharedSecret != "sek-123" {
t.Fatalf("want resolved config; got %+v", cfg)
}
if calls != 3 {
t.Errorf("want exactly 3 fetch attempts; got %d", calls)
}
if slept != 2 {
t.Errorf("want a sleep between each of the 2 failed attempts; got %d", slept)
}
}
func TestResolveWebRTCConfigWithRetry_exhaustsAndReturnsError(t *testing.T) {
// A persistent error (e.g. a decrypt failure after cluster-secret rotation)
// must exhaust all attempts and return the final error — the caller maps
// that to unresolved (NOT disabled). No sleep after the final attempt.
calls := 0
slept := 0
cfg, err := resolveWebRTCConfigWithRetry(4, time.Millisecond, func(time.Duration) { slept++ },
func() (*WebRTCConfig, error) {
calls++
return nil, errors.New("decrypt failed")
})
if err == nil {
t.Fatal("want the final error after exhausting retries; got nil")
}
if cfg != nil {
t.Errorf("want nil config on exhaustion; got %+v", cfg)
}
if calls != 4 {
t.Errorf("want 4 attempts (all retries used); got %d", calls)
}
if slept != 3 {
t.Errorf("want a sleep between attempts but not after the last; got %d", slept)
}
}

View File

@ -801,16 +801,24 @@ func (s *SystemdSpawner) SaveClusterState(namespace string, data []byte) error {
return fmt.Errorf("failed to create namespace dir: %w", err) return fmt.Errorf("failed to create namespace dir: %w", err)
} }
path := filepath.Join(dir, "cluster-state.json") path := filepath.Join(dir, "cluster-state.json")
// 0600 + chmod: cluster-state.json carries the namespace TURN shared secret // Atomic write to a temp file + rename: cluster-state.json carries the
// for cold-start resilience (bugboard #130), so it must not be world/group // namespace TURN shared secret (bugboard #130), so it must not be
// readable on the receiving node either. WriteFile's mode only applies on // world/group readable on the receiving node either, and a reader must
// create, so chmod explicitly to tighten a file an older release wrote 0644. // never see a half-written secret. 0600 + chmod on the temp file keeps the
if err := os.WriteFile(path, data, 0600); err != nil { // secret private; the rename then makes the live file 0600 too, tightening
return fmt.Errorf("failed to write cluster state: %w", err) // a file an older release wrote 0644.
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0600); err != nil {
return fmt.Errorf("failed to write temp cluster state: %w", err)
} }
if err := os.Chmod(path, 0600); err != nil { if err := os.Chmod(tmp, 0600); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to set cluster state permissions: %w", err) return fmt.Errorf("failed to set cluster state permissions: %w", err)
} }
if err := os.Rename(tmp, path); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to rename cluster state into place: %w", err)
}
s.logger.Info("Saved cluster state from coordinator", s.logger.Info("Saved cluster state from coordinator",
zap.String("namespace", namespace), zap.String("namespace", namespace),
zap.String("path", path)) zap.String("path", path))

View File

@ -23,12 +23,14 @@ package ntfy
import ( import (
"context" "context"
"crypto/tls"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"net/url" "net/url"
"strings" "strings"
"sync"
"time" "time"
"github.com/DeBrosOfficial/network/pkg/push" "github.com/DeBrosOfficial/network/pkg/push"
@ -45,6 +47,23 @@ type Config struct {
AuthToken string AuthToken string
// Timeout bounds each Send call. 0 selects 5 seconds. // Timeout bounds each Send call. 0 selects 5 seconds.
Timeout time.Duration Timeout time.Duration
// FanoutResolver, when set, returns the set of ntfy publish base URLs to
// deliver EACH publish to — one per active push node. The cluster runs an
// independent ntfy per node with NO shared message store, while subscribers
// are scattered across nodes by round-robin DNS; a publish that lands on one
// node only reaches subscribers on that node, losing ~(N-1)/N (bugboard
// #858). Fanning a publish to EVERY node guarantees it reaches whichever
// instance the subscriber's connection landed on. When nil, or it returns no
// hosts (or errors), Send falls back to the single BaseURL — so push never
// breaks if node discovery is unavailable.
FanoutResolver func(ctx context.Context) ([]string, error)
// FanoutHostHeader, when set, overrides the HTTP Host header and TLS SNI on
// fan-out requests. Needed because FanoutResolver returns per-node addresses
// (IPs) but each node's reverse proxy (Caddy) routes by — and serves its TLS
// cert for — the public push hostname. Empty: no override (tests /
// homogeneous hosts).
FanoutHostHeader string
} }
// Provider is the ntfy push.PushProvider implementation. // Provider is the ntfy push.PushProvider implementation.
@ -52,6 +71,9 @@ type Provider struct {
baseURL string baseURL string
authToken string authToken string
httpClient *http.Client httpClient *http.Client
fanoutClient *http.Client
fanoutResolver func(ctx context.Context) ([]string, error)
fanoutHostHeader string
logger *zap.Logger logger *zap.Logger
} }
@ -64,18 +86,37 @@ func New(cfg Config, logger *zap.Logger) *Provider {
if timeout <= 0 { if timeout <= 0 {
timeout = 5 * time.Second timeout = 5 * time.Second
} }
return &Provider{ p := &Provider{
baseURL: strings.TrimRight(cfg.BaseURL, "/"), baseURL: strings.TrimRight(cfg.BaseURL, "/"),
authToken: cfg.AuthToken, authToken: cfg.AuthToken,
httpClient: &http.Client{Timeout: timeout}, httpClient: &http.Client{Timeout: timeout},
fanoutResolver: cfg.FanoutResolver,
fanoutHostHeader: cfg.FanoutHostHeader,
logger: logger.Named("ntfy"), logger: logger.Named("ntfy"),
} }
if cfg.FanoutResolver != nil {
// Fan-out requests dial per-node addresses but must present the public
// push hostname for SNI so each node's Caddy serves the right cert and
// routes to its local ntfy. A dedicated client carries that fixed SNI.
tr := &http.Transport{}
if cfg.FanoutHostHeader != "" {
tr.TLSClientConfig = &tls.Config{ServerName: cfg.FanoutHostHeader}
}
p.fanoutClient = &http.Client{Timeout: timeout, Transport: tr}
}
return p
} }
// Name implements push.PushProvider. // Name implements push.PushProvider.
func (p *Provider) Name() string { return "ntfy" } func (p *Provider) Name() string { return "ntfy" }
// Send delivers a push notification to the device's ntfy topic. // Send delivers a push notification to the device's ntfy topic.
//
// When a FanoutResolver is configured, the publish is delivered to EVERY active
// push node (the ntfy instances don't share state, so the subscriber's instance
// — whichever the round-robin LB picked — must be among the targets), and Send
// succeeds as long as at least one instance accepted it (bugboard #858).
// Otherwise it publishes to the single configured BaseURL.
func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error { func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
if msg.DeviceToken == "" { if msg.DeviceToken == "" {
return push.ErrEmptyToken return push.ErrEmptyToken
@ -84,7 +125,7 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
return fmt.Errorf("ntfy: base URL not configured") return fmt.Errorf("ntfy: base URL not configured")
} }
endpointURL, err := p.resolveEndpoint(msg.DeviceToken) topic, err := p.resolveTopic(msg.DeviceToken)
if err != nil { if err != nil {
return err return err
} }
@ -102,10 +143,73 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
body = string(b) body = string(b)
} }
// Resolve the set of base URLs to publish to. Default: the single base URL.
// With a fan-out resolver, publish to every active push node so the
// subscriber's instance is always covered. Resolver failure is non-fatal —
// fall back to the base URL so push keeps working.
bases := []string{p.baseURL}
httpClient := p.httpClient
hostHeader := ""
if p.fanoutResolver != nil {
if hosts, rerr := p.fanoutResolver(ctx); rerr != nil {
p.logger.Warn("ntfy fan-out node resolution failed; publishing to base URL only", zap.Error(rerr))
} else if len(hosts) > 0 {
bases = hosts
httpClient = p.fanoutClient
hostHeader = p.fanoutHostHeader
}
}
if len(bases) == 1 {
return p.postOne(ctx, httpClient, bases[0], topic, body, msg, hostHeader)
}
// Fan out concurrently. Success = at least one instance accepted the
// publish (the message is in the cluster). A node that's down is logged but
// does not fail the Send, since the message still reaches every reachable
// instance — including, in the common case, the subscriber's.
var wg sync.WaitGroup
errs := make([]error, len(bases))
for i, base := range bases {
wg.Add(1)
go func(i int, base string) {
defer wg.Done()
errs[i] = p.postOne(ctx, httpClient, base, topic, body, msg, hostHeader)
}(i, base)
}
wg.Wait()
okCount := 0
var firstErr error
for _, e := range errs {
if e == nil {
okCount++
} else if firstErr == nil {
firstErr = e
}
}
if okCount == 0 {
return fmt.Errorf("ntfy: fan-out to all %d push nodes failed: %w", len(bases), firstErr)
}
if okCount < len(bases) {
p.logger.Warn("ntfy fan-out partial failure (message still delivered to the reachable instances)",
zap.Int("delivered", okCount), zap.Int("total", len(bases)), zap.Error(firstErr))
}
return nil
}
// postOne publishes a single (already-resolved) topic+body to one ntfy base URL.
// hostHeader, when non-empty, overrides the HTTP Host header so a request dialed
// at a node IP is still routed by the node's proxy as the public push hostname.
func (p *Provider) postOne(ctx context.Context, httpClient *http.Client, base, topic, body string, msg push.PushMessage, hostHeader string) error {
endpointURL := strings.TrimRight(base, "/") + "/" + topic
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpointURL, strings.NewReader(body)) req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpointURL, strings.NewReader(body))
if err != nil { if err != nil {
return fmt.Errorf("ntfy: build request: %w", err) return fmt.Errorf("ntfy: build request: %w", err)
} }
if hostHeader != "" {
req.Host = hostHeader
}
if msg.Title != "" { if msg.Title != "" {
req.Header.Set("Title", msg.Title) req.Header.Set("Title", msg.Title)
@ -127,15 +231,15 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
req.Header.Set("Authorization", "Bearer "+p.authToken) req.Header.Set("Authorization", "Bearer "+p.authToken)
} }
resp, err := p.httpClient.Do(req) resp, err := httpClient.Do(req)
if err != nil { if err != nil {
return fmt.Errorf("ntfy: post: %w", err) return fmt.Errorf("ntfy: post: %w", err)
} }
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode >= 400 { if resp.StatusCode >= 400 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return fmt.Errorf("ntfy: http %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) return fmt.Errorf("ntfy: http %d: %s", resp.StatusCode, strings.TrimSpace(string(errBody)))
} }
// Drain body to allow connection reuse. // Drain body to allow connection reuse.
@ -143,20 +247,21 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
return nil return nil
} }
// resolveEndpoint maps a device token to the ntfy publish URL. // resolveTopic maps a device token to the escaped ntfy topic path (without the
// base URL), so the same topic can be published to one or many push nodes.
// //
// The token is one of two shapes: // The token is one of two shapes:
// //
// - A plain ntfy topic (possibly hierarchical, e.g. "ns/myapp/user-1") — // - A plain ntfy topic (possibly hierarchical, e.g. "ns/myapp/user-1") —
// published to "<baseURL>/<topic>", with each path segment escaped so a // each path segment is escaped so a crafted token can't break out of the
// crafted token can't break out of the topic path. // topic path.
// - A full UnifiedPush endpoint URL handed to the client by the ntfy // - A full UnifiedPush endpoint URL handed to the client by the ntfy
// distributor (e.g. "https://push.example.com/up<random>"). UnifiedPush // distributor (e.g. "https://push.example.com/up<random>"). UnifiedPush
// requires the application server to POST to that endpoint verbatim, so we // requires the application server to POST to that endpoint, so we accept it
// use it as-is — but ONLY after verifying its scheme+host match the // — but ONLY after verifying its scheme+host match the configured base URL,
// configured base URL. That check turns a device-supplied token into an // then take only its path as the topic. That turns a device-supplied token
// SSRF only against our own push host, never an arbitrary one. // into a publish only against our own push host, never an arbitrary one.
func (p *Provider) resolveEndpoint(token string) (string, error) { func (p *Provider) resolveTopic(token string) (string, error) {
topic := token topic := token
if isAbsoluteHTTPURL(token) { if isAbsoluteHTTPURL(token) {
u, err := url.Parse(token) u, err := url.Parse(token)
@ -173,10 +278,7 @@ func (p *Provider) resolveEndpoint(token string) (string, error) {
return "", fmt.Errorf("ntfy: endpoint host %q does not match configured push host %q", u.Host, base.Host) return "", fmt.Errorf("ntfy: endpoint host %q does not match configured push host %q", u.Host, base.Host)
} }
// Confine the URL form to the SAME publish surface as a bare topic: // Confine the URL form to the SAME publish surface as a bare topic:
// take only the path as the topic and re-build through the per-segment // take only the path as the topic, dropping any query/fragment.
// escaping below, dropping any query/fragment. So a UnifiedPush
// endpoint token can publish a topic but can't gain arbitrary path or
// query control on the push host beyond what a plain topic already has.
topic = strings.TrimPrefix(u.Path, "/") topic = strings.TrimPrefix(u.Path, "/")
if topic == "" { if topic == "" {
return "", fmt.Errorf("ntfy: endpoint url %q has no topic path", token) return "", fmt.Errorf("ntfy: endpoint url %q has no topic path", token)
@ -188,7 +290,7 @@ func (p *Provider) resolveEndpoint(token string) (string, error) {
for i, seg := range parts { for i, seg := range parts {
parts[i] = url.PathEscape(seg) parts[i] = url.PathEscape(seg)
} }
return p.baseURL + "/" + strings.Join(parts, "/"), nil return strings.Join(parts, "/"), nil
} }
// isAbsoluteHTTPURL reports whether s looks like an absolute http(s) URL (the // isAbsoluteHTTPURL reports whether s looks like an absolute http(s) URL (the

View File

@ -8,6 +8,7 @@ import (
"net/http/httptest" "net/http/httptest"
"net/url" "net/url"
"strings" "strings"
"sync"
"testing" "testing"
"time" "time"
@ -306,3 +307,136 @@ func TestName(t *testing.T) {
t.Errorf("expected Name=ntfy, got %s", p.Name()) t.Errorf("expected Name=ntfy, got %s", p.Name())
} }
} }
// ----------------------------------------------------------------------------
// Bugboard #858 — cluster fan-out. Each push node runs an independent ntfy with
// no shared store, so a publish must reach EVERY node for the subscriber's
// instance (round-robin DNS picks one) to receive it.
// ----------------------------------------------------------------------------
// fanoutRecorder is a test ntfy node that records the topics it received.
type fanoutRecorder struct {
mu sync.Mutex
topics []string
}
func newFanoutNode(t *testing.T) (*httptest.Server, *fanoutRecorder) {
t.Helper()
rec := &fanoutRecorder{}
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
rec.mu.Lock()
rec.topics = append(rec.topics, strings.TrimPrefix(r.URL.Path, "/"))
rec.mu.Unlock()
w.WriteHeader(http.StatusOK)
}))
return srv, rec
}
func (r *fanoutRecorder) count() int {
r.mu.Lock()
defer r.mu.Unlock()
return len(r.topics)
}
func TestSend_fanout_publishesToAllNodes(t *testing.T) {
s1, r1 := newFanoutNode(t)
defer s1.Close()
s2, r2 := newFanoutNode(t)
defer s2.Close()
s3, r3 := newFanoutNode(t)
defer s3.Close()
p := New(Config{
BaseURL: s1.URL, // base URL still required; fan-out targets come from the resolver
FanoutResolver: func(context.Context) ([]string, error) {
return []string{s1.URL, s2.URL, s3.URL}, nil
},
}, nil)
if err := p.Send(context.Background(), push.PushMessage{DeviceToken: "user-1", Body: "hi"}); err != nil {
t.Fatalf("Send: %v", err)
}
for i, r := range []*fanoutRecorder{r1, r2, r3} {
if r.count() != 1 {
t.Errorf("node %d received %d publishes; want exactly 1 (the publish must reach every node)", i+1, r.count())
}
if r.count() == 1 && r.topics[0] != "user-1" {
t.Errorf("node %d got topic %q; want user-1", i+1, r.topics[0])
}
}
}
func TestSend_fanout_oneNodeDown_stillSucceeds(t *testing.T) {
up, rUp := newFanoutNode(t)
defer up.Close()
down, _ := newFanoutNode(t)
down.Close() // unreachable
p := New(Config{
BaseURL: up.URL,
FanoutResolver: func(context.Context) ([]string, error) {
return []string{up.URL, down.URL}, nil
},
}, nil)
// At least one node accepted it → Send succeeds; the message still reached
// the reachable instances.
if err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"}); err != nil {
t.Fatalf("Send should succeed when at least one node is up; got %v", err)
}
if rUp.count() != 1 {
t.Errorf("the up node should have received the publish; got %d", rUp.count())
}
}
func TestSend_fanout_allNodesDown_returnsError(t *testing.T) {
d1, _ := newFanoutNode(t)
d1.Close()
d2, _ := newFanoutNode(t)
d2.Close()
p := New(Config{
BaseURL: "http://127.0.0.1:1", // unused for posting; just non-empty
FanoutResolver: func(context.Context) ([]string, error) {
return []string{d1.URL, d2.URL}, nil
},
}, nil)
if err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"}); err == nil {
t.Fatal("Send should fail when every node is unreachable")
}
}
func TestSend_fanout_resolverEmpty_fallsBackToBaseURL(t *testing.T) {
base, rBase := newFanoutNode(t)
defer base.Close()
p := New(Config{
BaseURL: base.URL,
FanoutResolver: func(context.Context) ([]string, error) { return nil, nil }, // no active nodes
}, nil)
if err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"}); err != nil {
t.Fatalf("Send: %v", err)
}
if rBase.count() != 1 {
t.Errorf("empty resolver must fall back to the base URL; base got %d publishes", rBase.count())
}
}
func TestSend_fanout_resolverError_fallsBackToBaseURL(t *testing.T) {
base, rBase := newFanoutNode(t)
defer base.Close()
p := New(Config{
BaseURL: base.URL,
FanoutResolver: func(context.Context) ([]string, error) { return nil, context.DeadlineExceeded },
}, nil)
if err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"}); err != nil {
t.Fatalf("resolver error must not fail the push (fall back to base URL); got %v", err)
}
if rBase.count() != 1 {
t.Errorf("resolver error must fall back to the base URL; base got %d publishes", rBase.count())
}
}