Compare commits

..

1 Commits

Author SHA1 Message Date
anonpenguin
8b8f0a4251
Merge pull request #93 from DeBrosDAO/nightly
release: 0.122.47 — nightly → main
2026-06-11 17:37:20 +03:00
43 changed files with 323 additions and 3396 deletions

View File

@ -1 +1 @@
0.122.55
0.122.47

View File

@ -1,15 +0,0 @@
-- =============================================================================
-- 031_refresh_token_custom_claims.sql
--
-- Carry the additive JWT custom claims (e.g. the namespace's account_id from
-- the auth-claims-provider hook, bugboard #548/#920) ALONGSIDE the refresh
-- token, so a rotated access token keeps the same claims without re-invoking
-- the namespace's claims-provider function on every 15-min refresh (the
-- refresh path is the latency-critical VoIP-wake path, bugboard #125).
--
-- Resolved once at /v1/auth/verify mint time, stored here, and replayed +
-- propagated across each rotation. NULL/absent = no custom claims (the default
-- for every namespace without a claims-provider) → fully backward compatible.
-- =============================================================================
ALTER TABLE refresh_tokens ADD COLUMN custom_claims TEXT;

View File

@ -1,20 +0,0 @@
-- 032_refresh_token_reuse_grace.sql
--
-- Bugboard #125: bounded, single-use reuse grace for rotated refresh tokens.
--
-- Refresh-token rotation is single-use: a successful /v1/auth/refresh revokes
-- the presented token and issues a new one. If the rotation RESPONSE is lost
-- in transit (e.g. a reconnect storm during a gateway roll), the client is
-- left holding a just-revoked token and its retry dead-ends in a 401 -> SIWE.
-- On a VoIP-woken locked screen SIWE is impossible, so the call dies.
--
-- grace_used_at lets the gateway accept a just-rotated token ONE more time
-- within a short window (RFC 9700 §4.13.2 reuse grace) and mint a fresh
-- session, while the single-use CAS on this column prevents a stolen token
-- from being replayed repeatedly. NULL = grace not yet consumed.
--
-- Additive ALTER (rolling-upgrade safe): older gateways ignore the column;
-- newer ones read it back NULL for pre-existing rows, which is the correct
-- "grace available" default.
ALTER TABLE refresh_tokens ADD COLUMN grace_used_at TIMESTAMP;

View File

@ -43,10 +43,9 @@ type AuthService interface {
// Verifies signature, expiration, and issuer.
ParseAndVerifyJWT(token string) (*JWTClaims, error)
// GenerateJWT creates a new signed JWT with the specified subject, TTL, and
// optional additive custom claims (nil = none; bugboard #548).
// GenerateJWT creates a new signed JWT with the specified claims and TTL.
// Returns: token, expirationUnix, error.
GenerateJWT(namespace, subject string, ttl time.Duration, custom map[string]string) (string, int64, error)
GenerateJWT(namespace, subject string, ttl time.Duration) (string, int64, error)
// RegisterApp registers a new client application with the gateway.
// Returns an application ID that can be used for OAuth flows.

View File

@ -4,7 +4,6 @@ import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
)
// sha256Hex returns the lowercase hex-encoded SHA-256 hash of the input string.
@ -23,34 +22,3 @@ func HmacSHA256Hex(data, secret string) string {
mac.Write([]byte(data))
return hex.EncodeToString(mac.Sum(nil))
}
// marshalClaims encodes additive JWT custom claims as a JSON string so they
// can be persisted next to a refresh token (bugboard #548). A nil or empty
// map — or one that cannot be encoded — yields "", which unmarshalClaims
// reads back as nil.
func marshalClaims(m map[string]string) string {
	if len(m) > 0 {
		if encoded, err := json.Marshal(m); err == nil {
			return string(encoded)
		}
	}
	return ""
}
// unmarshalClaims decodes what marshalClaims stored. It is deliberately
// fail-soft: an empty string, malformed JSON, or a blob that decodes to no
// entries all come back as nil, so a corrupt claims value can never break
// token rotation — the token just rotates without custom claims.
func unmarshalClaims(s string) map[string]string {
	if s == "" {
		return nil
	}
	var decoded map[string]string
	if json.Unmarshal([]byte(s), &decoded) != nil || len(decoded) == 0 {
		return nil
	}
	return decoded
}

View File

@ -182,22 +182,15 @@ func (s *Service) ParseAndVerifyJWT(token string) (*JWTClaims, error) {
return &claims, nil
}
// GenerateJWT mints a signed access token. `custom` carries additive
// app-defined claims (e.g. the namespace's account_id from the claims-provider
// hook, bugboard #548) under the top-level "custom" object — read back via
// JWTClaims.Custom / oh.GetCallerClaim. Pass nil for none. Reserved claims
// (sub/iss/aud/iat/nbf/exp/namespace) are always gateway-controlled and cannot
// be overridden by `custom` (the caller is responsible for not putting
// reserved keys here; the claims-provider path sanitizes them out upstream).
func (s *Service) GenerateJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
func (s *Service) GenerateJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
// Prefer EdDSA when available
if s.preferEdDSA && s.edSigningKey != nil {
return s.generateEdDSAJWT(ns, subject, ttl, custom)
return s.generateEdDSAJWT(ns, subject, ttl)
}
return s.generateRSAJWT(ns, subject, ttl, custom)
return s.generateRSAJWT(ns, subject, ttl)
}
func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
if s.edSigningKey == nil {
return "", 0, errors.New("EdDSA signing key unavailable")
}
@ -218,9 +211,6 @@ func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration, custom
"exp": exp.Unix(),
"namespace": ns,
}
if len(custom) > 0 {
payload["custom"] = custom
}
pb, _ := json.Marshal(payload)
hb64 := base64.RawURLEncoding.EncodeToString(hb)
pb64 := base64.RawURLEncoding.EncodeToString(pb)
@ -230,7 +220,7 @@ func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration, custom
return signingInput + "." + sb64, exp.Unix(), nil
}
func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
if s.signingKey == nil {
return "", 0, errors.New("signing key unavailable")
}
@ -251,9 +241,6 @@ func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration, custom m
"exp": exp.Unix(),
"namespace": ns,
}
if len(custom) > 0 {
payload["custom"] = custom
}
pb, _ := json.Marshal(payload)
hb64 := base64.RawURLEncoding.EncodeToString(hb)
pb64 := base64.RawURLEncoding.EncodeToString(pb)

View File

@ -31,19 +31,8 @@ type rotationMockORMDB struct {
client.DatabaseClient
mu sync.Mutex
subjectByToken map[string]string // hashedToken -> subject (nil/missing = "invalid")
claimsByToken map[string]string // hashedToken -> custom_claims JSON (bugboard #548)
// graceableTokens: hashedToken -> subject for tokens that are revoked but
// still inside the reuse-grace window (bugboard #125). The grace SELECT
// (detected by the grace_used_at predicate) reads from here.
graceableTokens map[string]string
inserted int // count of INSERTs (new refresh-token rows)
subjects map[string]string // subject -> last hashed token inserted
// selectErrRemaining: number of upcoming "SELECT subject" calls that
// should return selectErr (simulates a transient rqlite leader outage).
// Decremented per matching call; 0 = serve normally (bugboard #125).
selectErr error
selectErrRemaining int
selectAttemptsTaken int
}
func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interface{}) (*client.QueryResult, error) {
@ -56,58 +45,17 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
if containsCI(sql, "SELECT id FROM namespaces") {
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{int64(1)}}}, nil
}
// Grace-path SELECT (bugboard #125): SELECT subject for a recently-revoked,
// grace-available token. Distinguished from the active-path SELECT by the
// grace_used_at predicate. Must be checked BEFORE the generic handler.
if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") && containsCI(sql, "grace_used_at") {
if len(args) < 2 {
return &client.QueryResult{Count: 0}, nil
}
hashedTok, _ := args[1].(string)
if subj, ok := m.graceableTokens[hashedTok]; ok && subj != "" {
claims := ""
if m.claimsByToken != nil {
claims = m.claimsByToken[hashedTok]
}
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj, claims}}}, nil
}
return &client.QueryResult{Count: 0}, nil
}
// SELECT subject (+ custom_claims, bugboard #548) for the lookup.
if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") {
m.selectAttemptsTaken++
if m.selectErrRemaining > 0 {
m.selectErrRemaining--
return nil, m.selectErr
}
// SELECT subject for the refresh-token lookup.
if containsCI(sql, "SELECT subject FROM refresh_tokens") {
if len(args) < 2 {
return &client.QueryResult{Count: 0}, nil
}
hashedTok, _ := args[1].(string)
if subj, ok := m.subjectByToken[hashedTok]; ok && subj != "" {
claims := ""
if m.claimsByToken != nil {
claims = m.claimsByToken[hashedTok]
}
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj, claims}}}, nil
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj}}}, nil
}
return &client.QueryResult{Count: 0}, nil
}
// RevokeToken UPDATE that ALSO burns the grace slot (bugboard #125
// logout-bypass fix). Reflect it by clearing the token's grace eligibility
// so a follow-on grace SELECT misses it. (The rotation grace CAS goes
// through the rqlite Exec mock, not here, so there's no collision.)
if containsCI(sql, "UPDATE refresh_tokens") && containsCI(sql, "grace_used_at") && len(args) >= 2 {
if key, ok := args[1].(string); ok && m.graceableTokens != nil {
delete(m.graceableTokens, key) // single-token: key is the hashed token
for tok, subj := range m.graceableTokens {
if subj == key { // revoke-all: key is the subject
delete(m.graceableTokens, tok)
}
}
}
return &client.QueryResult{Count: 1}, nil
}
// INSERT new refresh_tokens row.
if containsCI(sql, "INSERT INTO refresh_tokens") {
m.inserted++
@ -123,14 +71,6 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
m.subjectByToken = map[string]string{}
}
m.subjectByToken[hashedTok] = subj
// custom_claims is the LAST arg (bugboard #548) — capture it so
// rotation-propagation tests can assert it carries forward.
if m.claimsByToken == nil {
m.claimsByToken = map[string]string{}
}
if cc, ok := args[len(args)-1].(string); ok {
m.claimsByToken[hashedTok] = cc
}
}
return &client.QueryResult{Count: 1}, nil
}
@ -149,12 +89,6 @@ type rotationMockRqlite struct {
rowsAffectedNext []int64 // programmable per-call values; pop from front. Defaults to "revoke if unrevoked".
execErrNext []error // programmable per-call errors
parallelExecGuard sync.Mutex
// graceCASNext: programmable RowsAffected for the grace CAS (UPDATE ... SET
// grace_used_at). 1 = won the single-use grace; 0 = already consumed
// (bugboard #125). Defaults to "win once per token".
graceCASNext []int64
graceConsumed map[string]bool
graceCASCalls int
}
func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interface{}) (sql.Result, error) {
@ -175,29 +109,6 @@ func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interfa
}
}
// Grace CAS (bugboard #125): UPDATE ... SET grace_used_at, single-use.
if containsCI(sql, "SET grace_used_at") && len(args) >= 2 {
m.graceCASCalls++
hashedTok, _ := args[1].(string)
if m.graceConsumed == nil {
m.graceConsumed = map[string]bool{}
}
var affected int64
if len(m.graceCASNext) > 0 {
affected = m.graceCASNext[0]
m.graceCASNext = m.graceCASNext[1:]
if affected == 1 {
m.graceConsumed[hashedTok] = true
}
} else if !m.graceConsumed[hashedTok] {
m.graceConsumed[hashedTok] = true
affected = 1
} else {
affected = 0
}
return &rotationFakeResult{affected: affected}, nil
}
// Default UPDATE behavior: matches if token is currently unrevoked.
if containsCI(sql, "UPDATE refresh_tokens SET revoked_at") && len(args) >= 2 {
hashedTok, _ := args[1].(string)
@ -458,233 +369,3 @@ func TestRefreshToken_RotatedTokenReplayFails(t *testing.T) {
t.Fatal("expected error reusing rotated token, got nil")
}
}
// Bugboard #125: when the refresh-token lookup keeps failing with an rqlite
// error (e.g. the leader is briefly unavailable during a rolling restart), the
// caller must get ErrRefreshTransient (→ 503, retryable) rather than "invalid
// or expired" (→ 401, full SIWE re-auth, which is impossible on a locked
// device answering a VoIP-woken call).
func TestRefreshToken_transientSelectError_returnsTransient(t *testing.T) {
	svc, db, _ := newRotationTestService(t)

	const token = "valid-but-leader-down"
	db.subjectByToken[sha256Hex(token)] = "0xWALLET"

	// Queue enough failures that every lookup attempt in the retry window errors.
	db.selectErrRemaining = 99
	db.selectErr = errors.New("rqlite: leadership lost")

	if _, _, _, _, err := svc.RefreshToken(context.Background(), token, "anchat-test"); !errors.Is(err, ErrRefreshTransient) {
		t.Fatalf("err = %v, want ErrRefreshTransient (a valid token must not 401 during a leader outage)", err)
	}
}
// Because the lookup is retried, a short blip heals inside a single refresh
// call: the client sees a normal successful rotation and no failure at all.
func TestRefreshToken_selectRecoversAfterRetry(t *testing.T) {
	svc, db, _ := newRotationTestService(t)

	const token = "valid-blips-then-ok"
	db.subjectByToken[sha256Hex(token)] = "0xWALLET"
	db.selectErr = errors.New("rqlite: leadership lost")
	// Every attempt except the final one fails.
	db.selectErrRemaining = refreshSelectRetries - 1

	accessTok, rotated, subject, _, err := svc.RefreshToken(context.Background(), token, "anchat-test")
	if err != nil {
		t.Fatalf("RefreshToken should recover after transient blips: %v", err)
	}
	if complete := accessTok != "" && rotated != "" && subject == "0xWALLET"; !complete {
		t.Errorf("recovered refresh incomplete: access=%q newRefresh=%q subj=%q", accessTok, rotated, subject)
	}
}
// An rqlite error on the revoke CAS write must likewise surface as the
// retryable ErrRefreshTransient instead of a 401.
func TestRefreshToken_transientUpdateError_returnsTransient(t *testing.T) {
	svc, db, rq := newRotationTestService(t)

	const token = "valid-cas-write-down"
	db.subjectByToken[sha256Hex(token)] = "0xWALLET"

	writeFailure := errors.New("rqlite: write failed, no leader")
	rq.execErrNext = []error{writeFailure}

	if _, _, _, _, err := svc.RefreshToken(context.Background(), token, "anchat-test"); !errors.Is(err, ErrRefreshTransient) {
		t.Fatalf("err = %v, want ErrRefreshTransient on a transient CAS write error", err)
	}
}
// The flip side of the #125 fix: a token the store has never seen is a hard
// invalid (401) and must not be disguised as a transient, retryable failure.
func TestRefreshToken_unknownToken_isNotTransient(t *testing.T) {
	svc, _, _ := newRotationTestService(t)

	_, _, _, _, err := svc.RefreshToken(context.Background(), "never-existed", "anchat-test")
	switch {
	case err == nil:
		t.Fatal("expected error for unknown token")
	case errors.Is(err, ErrRefreshTransient):
		t.Errorf("unknown token must be a genuine invalid (401), not transient (503): %v", err)
	}
}
// mockClaimsResolver stands in for a claims-provider in the mint tests: it
// hands back the same fixed claim set on every call.
type mockClaimsResolver struct {
	claims map[string]string
}

// ResolveClaims ignores the context, wallet and namespace and returns the
// canned claims unchanged.
func (r mockClaimsResolver) ResolveClaims(context.Context, string, string) map[string]string {
	return r.claims
}
// Bugboard #548: claims resolved at IssueTokens (login) must be stored with the
// refresh token AND replayed into every rotated access token, so account_id
// survives the 15-min refresh without re-invoking the provider.
//
// The test walks three steps: login mint, first rotation, second rotation. At
// each step it checks the claim on the access token and/or the stored row.
func TestRefreshToken_propagatesCustomClaims(t *testing.T) {
	s, ormDB, _ := newRotationTestService(t)
	s.SetClaimsResolver(mockClaimsResolver{claims: map[string]string{"account_id": "u-999"}})

	// Login mint — IssueTokens resolves + stores the claims with the refresh row.
	_, refresh, _, err := s.IssueTokens(context.Background(), "0xWALLET", "anchat-test")
	if err != nil {
		t.Fatalf("IssueTokens: %v", err)
	}
	if got := ormDB.claimsByToken[sha256Hex(refresh)]; got != `{"account_id":"u-999"}` {
		t.Fatalf("claims not stored with refresh token; got %q", got)
	}

	// Refresh — the rotated access token must carry account_id, and the NEW
	// refresh row must propagate the stored claims.
	access, newRefresh, _, _, err := s.RefreshToken(context.Background(), refresh, "anchat-test")
	if err != nil {
		t.Fatalf("RefreshToken: %v", err)
	}
	claims, err := s.ParseAndVerifyJWT(access)
	if err != nil {
		// Label the failure with the function actually called, matching the
		// second-hop check below.
		t.Fatalf("ParseAndVerifyJWT: %v", err)
	}
	if claims.Custom["account_id"] != "u-999" {
		t.Errorf("rotated access token lost account_id; custom=%v", claims.Custom)
	}
	if got := ormDB.claimsByToken[sha256Hex(newRefresh)]; got != `{"account_id":"u-999"}` {
		t.Errorf("rotation did not propagate claims to the new row; got %q", got)
	}

	// Second rotation hop (N+1 → N+2): the claim must survive repeated
	// rotations, not just the first — the propagation is the whole point.
	access2, _, _, _, err := s.RefreshToken(context.Background(), newRefresh, "anchat-test")
	if err != nil {
		t.Fatalf("second RefreshToken: %v", err)
	}
	claims2, err := s.ParseAndVerifyJWT(access2)
	if err != nil {
		t.Fatalf("ParseAndVerifyJWT (2nd): %v", err)
	}
	if claims2.Custom["account_id"] != "u-999" {
		t.Errorf("account_id lost across the second rotation; custom=%v", claims2.Custom)
	}
}
// ----------------------------------------------------------------------------
// Bugboard #125 — bounded, single-use refresh-token reuse grace (RFC 9700
// §4.13.2). A rotation response lost in transit must NOT dead-end in a 401.
// ----------------------------------------------------------------------------
// A token that was just rotated (revoked, still inside the grace window, grace
// not yet consumed) is honoured ONE more time and yields a fresh session. This
// rescues a client that never received its rotation response. The revoke CAS
// is skipped on this path because the token is already revoked, so the replay
// tripwire must NOT fire.
func TestRefreshToken_reuseGrace_recoversLostResponse(t *testing.T) {
	svc, db, rq := newRotationTestService(t)

	const lostTok = "rotated-but-response-lost"
	// Deliberately absent from the active set (already revoked), yet registered
	// as grace-eligible: revoked recently, grace still unused.
	db.graceableTokens = map[string]string{sha256Hex(lostTok): "0xWALLET"}

	accessTok, rotated, subject, expiry, err := svc.RefreshToken(context.Background(), lostTok, "anchat-test")
	if err != nil {
		t.Fatalf("grace recovery should succeed, got error: %v", err)
	}

	if accessTok == "" || rotated == "" {
		t.Error("grace recovery must mint a fresh access + refresh token")
	}
	if rotated == lostTok {
		t.Error("grace recovery must rotate to a NEW refresh token")
	}
	if subject != "0xWALLET" {
		t.Errorf("subject = %q, want 0xWALLET", subject)
	}
	if expiry <= 0 {
		t.Errorf("expiration not set: %d", expiry)
	}

	// Exactly one claim of the single-use grace CAS is expected.
	if calls := rq.graceCASCalls; calls != 1 {
		t.Errorf("grace CAS calls = %d, want 1", calls)
	}
	// The recovered session must be backed by one new refresh-token row.
	if rows := db.inserted; rows != 1 {
		t.Errorf("expected 1 INSERT for the recovered session, got %d", rows)
	}
}
// Grace is SINGLE-USE. If the grace_used_at CAS is lost — the grace was
// already consumed, e.g. a replay after the legitimate client recovered — the
// token must be rejected as invalid, so a stolen token cannot be replayed at
// leisure.
func TestRefreshToken_reuseGrace_singleUse_secondAttemptIs401(t *testing.T) {
	svc, db, rq := newRotationTestService(t)

	const tok = "already-grace-consumed"
	db.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"}
	// Program the grace CAS to report zero rows affected ("already consumed").
	rq.graceCASNext = []int64{0}

	_, _, _, _, err := svc.RefreshToken(context.Background(), tok, "anchat-test")
	if err == nil {
		t.Fatal("a consumed grace must NOT recover — expected an invalid-token error")
	}
	if msg := err.Error(); !containsCI(msg, "invalid or expired") {
		t.Errorf("want invalid/expired 401, got %v", err)
	}
	if rows := db.inserted; rows != 0 {
		t.Errorf("no new session should be minted when grace is consumed; inserts=%d", rows)
	}
}
// A token that is neither active nor grace-eligible is still rejected: the
// grace path must never turn an unknown token into a session.
func TestRefreshToken_noGrace_genuineBadToken_stays401(t *testing.T) {
	// graceableTokens is left untouched, so nothing qualifies for grace.
	svc, db, _ := newRotationTestService(t)

	_, _, _, _, err := svc.RefreshToken(context.Background(), "never-seen-this-token", "anchat-test")
	if err == nil {
		t.Fatal("a never-seen token must be rejected")
	}
	if msg := err.Error(); !containsCI(msg, "invalid or expired") {
		t.Errorf("want invalid/expired 401, got %v", err)
	}
	if rows := db.inserted; rows != 0 {
		t.Errorf("no session should be minted for a bad token; inserts=%d", rows)
	}
}
// Security regression (bugboard #125 logout-bypass): once a token has been
// explicitly revoked through RevokeToken (logout), the reuse grace must not
// bring it back, even inside the 60s window. RevokeToken burns grace_used_at,
// so the grace predicate (grace_used_at IS NULL) no longer matches it.
func TestRevokeToken_burnsGrace_blocksLogoutBypass(t *testing.T) {
	svc, db, _ := newRotationTestService(t)
	ctx := context.Background()

	const tok = "logged-out-token"
	// Start from a token that WOULD qualify for grace...
	db.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"}
	// ...and then log the user out.
	if err := svc.RevokeToken(ctx, "anchat-test", tok, false, ""); err != nil {
		t.Fatalf("RevokeToken: %v", err)
	}

	// Refreshing with the logged-out token has to fail rather than resurrect it.
	_, _, _, _, err := svc.RefreshToken(ctx, tok, "anchat-test")
	if err == nil {
		t.Fatal("LOGOUT-BYPASS: a logged-out token was resurrected via reuse grace")
	}
	if msg := err.Error(); !containsCI(msg, "invalid or expired") {
		t.Errorf("want 401 invalid/expired, got %v", err)
	}
	if rows := db.inserted; rows != 0 {
		t.Errorf("no session should be minted for a logged-out token; inserts=%d", rows)
	}
}

View File

@ -35,8 +35,7 @@ type Service struct {
edKeyID string
preferEdDSA bool
defaultNS string
apiKeyHMACSecret string // HMAC secret for hashing API keys before storage
claimsResolver ClaimsResolver // namespace claims-provider hook (bugboard #548); nil = none
apiKeyHMACSecret string // HMAC secret for hashing API keys before storage
}
func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signingKeyPEM string, defaultNS string) (*Service, error) {
@ -85,28 +84,6 @@ func (s *Service) SetRqliteClient(db rqlite.Client) {
s.db = db
}
// ClaimsResolver resolves additive, namespace-defined JWT custom claims for an
// authenticated wallet at token-mint time (bugboard #548/#920). The concrete
// implementation invokes the namespace's reserved `auth-claims-provider`
// serverless function; it MUST be fail-open (return nil, never error) so a
// missing/slow/broken provider never breaks authentication. Injected via
// SetClaimsResolver; nil = no custom claims (every namespace's default).
//
// The method deliberately has no error return: implementations signal
// "no claims" (including any internal failure) by returning nil.
type ClaimsResolver interface {
// ResolveClaims returns the custom claims for wallet within namespace, or
// nil when there are none.
ResolveClaims(ctx context.Context, wallet, namespace string) map[string]string
}
// SetClaimsResolver installs r as the namespace claims-provider hook that is
// consulted when tokens are minted.
func (s *Service) SetClaimsResolver(r ClaimsResolver) {
	s.claimsResolver = r
}
// resolveCustomClaims asks the configured claims resolver for this wallet's
// additive claims in the given namespace. With no resolver configured it
// returns nil. There is no error path: the resolver is fail-open by contract.
func (s *Service) resolveCustomClaims(ctx context.Context, wallet, namespace string) map[string]string {
	if resolver := s.claimsResolver; resolver != nil {
		return resolver.ResolveClaims(ctx, wallet, namespace)
	}
	return nil
}
// ErrRotationNotConfigured is returned by RefreshToken when the service
// wasn't given an rqlite client — refusing to rotate without atomicity
// guarantees is safer than rotating non-atomically.
@ -247,13 +224,8 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
return "", "", 0, fmt.Errorf("signing key unavailable")
}
// Resolve namespace-defined additive claims (bugboard #548) ONCE at mint
// time. Stored with the refresh token below and replayed across rotations
// so the 15-min refresh path never re-invokes the provider.
custom := s.resolveCustomClaims(ctx, wallet, namespace)
// Issue access token (15m)
token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute, custom)
token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute)
if err != nil {
return "", "", 0, fmt.Errorf("failed to generate JWT: %w", err)
}
@ -274,8 +246,8 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
db := s.orm.Database()
hashedRefresh := sha256Hex(refresh)
if _, err := db.Query(internalCtx,
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
nsID, wallet, hashedRefresh, "gateway", marshalClaims(custom),
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))",
nsID, wallet, hashedRefresh, "gateway",
); err != nil {
return "", "", 0, fmt.Errorf("failed to store refresh token: %w", err)
}
@ -293,34 +265,6 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
// This is the tripwire promised by RFC 9700 §4.12 (refresh-token rotation).
var ErrRefreshTokenReplay = fmt.Errorf("refresh token already rotated or invalid")
// ErrRefreshTransient is returned when refresh-token rotation fails for a
// RETRYABLE reason — an rqlite-layer error rather than a genuine bad/expired
// token. Bugboard #125: during a rolling gateway restart the rqlite leader is
// briefly unavailable (re-election window), so the lookup/rotation errors;
// collapsing that into "invalid token" forces a 401 → full SIWE re-auth, which
// is impossible on a locked device answering a VoIP-woken call. Callers MUST
// surface this as a retryable 503, NOT a 401, so the client retries within the
// ring window instead of tearing down the session.
var ErrRefreshTransient = fmt.Errorf("refresh token rotation temporarily unavailable")
const (
// refreshSelectRetries bounds how many times the refresh lookup is retried
// when the rqlite read errors (transient leader unavailability). The read
// is idempotent and happens BEFORE any write, so retrying is safe.
refreshSelectRetries = 3
// refreshSelectRetryDelay is the backoff between lookup retries. Three
// tries × 250ms rides out a brief leader re-election without adding
// meaningful latency to the common (healthy-leader) path.
refreshSelectRetryDelay = 250 * time.Millisecond
// refreshReuseGrace is how long after a refresh token is rotated (revoked)
// the gateway will still accept it ONE more time, to recover a client whose
// rotation response was lost in transit — otherwise the retry dead-ends in a
// 401 → SIWE, impossible on a VoIP-woken locked screen (bugboard #125, RFC
// 9700 §4.13.2). Kept short, and single-use via grace_used_at, so a stolen
// token cannot be replayed at leisure.
refreshReuseGrace = 60 * time.Second
)
// RefreshToken validates the supplied refresh token, atomically rotates it
// (revokes the old, mints a new), and returns a fresh access token alongside
// the rotated refresh token.
@ -365,146 +309,58 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
nsID, err := s.ResolveNamespaceID(ctx, namespace)
if err != nil {
// Bugboard #125: namespace resolution runs an rqlite query BEFORE the
// token lookup, so a leader re-election during a rolling restart fails
// here too. Treat it as retryable (→ 503), not a bad token (→ 401) —
// the refresh-path namespace comes from an already-authenticated
// session, so a resolution failure is a transient DB error, never the
// client's fault.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh namespace resolution failed (transient, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
return "", "", "", 0, err
}
hashedRefresh := sha256Hex(refreshToken)
// Step 1: read the subject. Tells us who the token belongs to AND
// validates that it's currently usable (not revoked, not expired).
//
// Bugboard #125: distinguish a TRANSIENT rqlite error (leader briefly
// unavailable during a rolling restart) from a GENUINE token miss. The
// read is idempotent and pre-write, so we retry it a few times; only after
// exhausting retries do we surface ErrRefreshTransient (→ 503, client
// retries). An actual empty result (Count == 0) is a real bad/expired
// token → "invalid or expired" (→ 401). Collapsing the two used to 401 a
// valid session during every restart, defeating the VoIP-wake refresh.
selectQ := `SELECT subject, custom_claims FROM refresh_tokens
selectQ := `SELECT subject FROM refresh_tokens
WHERE namespace_id = ? AND token = ?
AND revoked_at IS NULL
AND (expires_at IS NULL OR expires_at > datetime('now'))
LIMIT 1`
var res *client.QueryResult
var selErr error
for attempt := 0; attempt < refreshSelectRetries; attempt++ {
res, selErr = ormDB.Query(internalCtx, selectQ, nsID, hashedRefresh)
if selErr == nil && res != nil {
break
}
if attempt < refreshSelectRetries-1 {
time.Sleep(refreshSelectRetryDelay)
}
res, err := ormDB.Query(internalCtx, selectQ, nsID, hashedRefresh)
if err != nil || res == nil || res.Count == 0 {
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
}
if selErr != nil || res == nil {
// rqlite error persisted across retries — leader likely mid-election.
// Retryable, NOT an invalid token.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token lookup failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(selErr))
return "", "", "", 0, ErrRefreshTransient
}
// graceRecovery is set when the presented token was NOT in the active set
// but qualifies for the bugboard #125 single-use reuse grace (a just-
// rotated token whose rotation response was lost). In that case the old row
// is already revoked, so we SKIP the revoke CAS (step 2) — the grace CAS
// inside tryRefreshReuseGrace is our single-use lock — and go straight to
// minting a fresh session.
graceRecovery := false
var custom map[string]string
if res.Count == 0 {
gSubject, gCustom, gOK, gErr := s.tryRefreshReuseGrace(internalCtx, ormDB, nsID, hashedRefresh)
if gErr != nil {
// Transient rqlite error during the grace lookup/claim — retryable,
// not a verdict on the token (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh reuse-grace lookup failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace), zap.Error(gErr))
return "", "", "", 0, ErrRefreshTransient
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if val, ok := res.Rows[0][0].(string); ok {
subject = val
} else {
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
}
if !gOK {
// Genuinely not found / revoked outside grace / grace already
// consumed / expired — a real bad token.
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
}
subject = gSubject
custom = gCustom
graceRecovery = true
s.logger.ComponentInfo(logging.ComponentGeneral,
"refresh token reuse-grace recovery (lost-response retry, single-use)",
zap.String("namespace", namespace), zap.String("subject", subject))
} else {
var customClaimsJSON string
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if val, ok := res.Rows[0][0].(string); ok {
subject = val
} else {
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
}
// custom_claims (bugboard #548) — resolved once at login, replayed on
// every rotation so the refresh path never re-invokes the provider.
if len(res.Rows[0]) > 1 {
if cc, ok := res.Rows[0][1].(string); ok {
customClaimsJSON = cc
}
}
}
custom = unmarshalClaims(customClaimsJSON)
}
// Step 2: atomic CAS — revoke the old row. RowsAffected is the lock.
// Two concurrent calls with the same refresh token: exactly one wins
// the UPDATE (RowsAffected == 1); the other sees RowsAffected == 0
// and bails with the replay tripwire.
//
// Skipped on a grace recovery (bugboard #125): the token is ALREADY
// revoked, so this CAS would always see RowsAffected == 0 and mis-fire the
// replay tripwire. The single-use grace CAS (grace_used_at) inside
// tryRefreshReuseGrace already served as the lock for this path.
if !graceRecovery {
updRes, err := s.db.Exec(internalCtx,
`UPDATE refresh_tokens SET revoked_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
nsID, hashedRefresh)
if err != nil {
// rqlite write error (leader unavailable) — retryable, not a bad
// token. No row was revoked, so a client retry is safe (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token revoke failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
affected, _ := updRes.RowsAffected()
if affected == 0 {
// Race lost OR replay attempt: token was unrevoked at step 1 but
// already revoked by step 2, meaning a concurrent call rotated it
// in between. Could be benign (same client retrying due to a
// transient network error) or malicious (stolen token + race).
// Either way: fail closed, log it, let the operator investigate.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token rotation: concurrent use detected (possible replay)",
zap.String("namespace", namespace),
zap.String("subject", subject))
return "", "", "", 0, ErrRefreshTokenReplay
}
updRes, err := s.db.Exec(internalCtx,
`UPDATE refresh_tokens SET revoked_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
nsID, hashedRefresh)
if err != nil {
return "", "", "", 0, fmt.Errorf("revoke old refresh token: %w", err)
}
affected, _ := updRes.RowsAffected()
if affected == 0 {
// Race lost OR replay attempt: token was unrevoked at step 1 but
// already revoked by step 2, meaning a concurrent call rotated it
// in between. Could be benign (same client retrying due to a
// transient network error) or malicious (stolen token + race).
// Either way: fail closed, log it, let the operator investigate.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token rotation: concurrent use detected (possible replay)",
zap.String("namespace", namespace),
zap.String("subject", subject))
return "", "", "", 0, ErrRefreshTokenReplay
}
// Step 3: mint the new access JWT, carrying forward the stored custom
// claims so a rotated token keeps the same account_id etc. (bugboard #548).
accessToken, expUnix, err = s.GenerateJWT(namespace, subject, 15*time.Minute, custom)
// Step 3: mint the new access JWT.
accessToken, expUnix, err = s.GenerateJWT(namespace, subject, 15*time.Minute)
if err != nil {
return "", "", "", 0, fmt.Errorf("generate access token: %w", err)
}
@ -520,96 +376,15 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
}
newRefreshToken = base64.RawURLEncoding.EncodeToString(rbuf)
hashedNew := sha256Hex(newRefreshToken)
// Re-marshal from the parsed map (not the raw stored string) so the new
// row and the freshly-minted access token are provably consistent and
// self-healing — a malformed stored blob converges to "" on both sides
// rather than being propagated forward verbatim. custom_claims is written
// ONLY here and in IssueTokens, both from a sanitized map (bugboard #548).
if _, err := ormDB.Query(internalCtx,
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
nsID, subject, hashedNew, "gateway", marshalClaims(custom)); err != nil {
// The old token is already revoked (step 2). A retryable error here
// leaves the client to re-attempt — which will re-auth since the old
// token is gone — but that's strictly better than masking a transient
// failure as a permanent 401 (bugboard #125). Surface retryable.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token store failed after revoke (transient rqlite error)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))",
nsID, subject, hashedNew, "gateway"); err != nil {
return "", "", "", 0, fmt.Errorf("store rotated refresh token: %w", err)
}
return accessToken, newRefreshToken, subject, expUnix, nil
}
// tryRefreshReuseGrace gives a just-rotated refresh token one bounded second
// chance (bugboard #125, RFC 9700 §4.13.2). It exists for the client whose
// rotation response was lost in transit: without it the retry presents an
// already-revoked token and dead-ends in a 401 → SIWE.
//
// Outcomes:
//   - (subject, custom, true, nil): the grace slot was claimed by this call.
//   - ("", nil, false, nil): no eligible row (unknown token, revoked outside
//     refreshReuseGrace, expired, grace already used, or empty subject) —
//     the caller answers 401.
//   - ("", nil, false, err): the lookup or the claim failed at the database
//     layer — the caller treats this as retryable (503).
//
// Security: eligibility is limited to the refreshReuseGrace window AND to a
// single use, enforced by a compare-and-set on grace_used_at, so a stolen
// token cannot be replayed repeatedly. This path never goes through the
// concurrent-rotation replay tripwire of the active path.
func (s *Service) tryRefreshReuseGrace(ctx context.Context, ormDB client.DatabaseClient, nsID interface{}, hashedRefresh string) (subject string, custom map[string]string, ok bool, err error) {
	// SQLite datetime modifier, e.g. "-60 seconds", bounding the grace window.
	window := fmt.Sprintf("-%d seconds", int(refreshReuseGrace.Seconds()))

	const eligibleRowSQL = `SELECT subject, custom_claims FROM refresh_tokens
WHERE namespace_id = ? AND token = ?
AND revoked_at IS NOT NULL
AND revoked_at > datetime('now', ?)
AND grace_used_at IS NULL
AND (expires_at IS NULL OR expires_at > datetime('now'))
LIMIT 1`
	found, lookupErr := ormDB.Query(ctx, eligibleRowSQL, nsID, hashedRefresh, window)
	switch {
	case lookupErr != nil:
		return "", nil, false, lookupErr // transient rqlite error → caller 503
	case found == nil || found.Count == 0:
		return "", nil, false, nil // no eligible grace row → caller 401
	}

	var owner, storedClaims string
	if len(found.Rows) > 0 && len(found.Rows[0]) > 0 {
		row := found.Rows[0]
		if text, isText := row[0].(string); isText {
			owner = text
		} else {
			// Non-string column value: round-trip through JSON, best effort.
			encoded, _ := json.Marshal(row[0])
			_ = json.Unmarshal(encoded, &owner)
		}
		if len(row) > 1 {
			// A non-string custom_claims value leaves storedClaims empty.
			storedClaims, _ = row[1].(string)
		}
	}
	if owner == "" {
		return "", nil, false, nil // defensive: never grace-mint an anonymous session
	}

	// Single-use claim. The window predicate is repeated so a row that aged
	// out between the SELECT and this UPDATE cannot be claimed; of several
	// concurrent presenters exactly one sees a row affected.
	const claimGraceSQL = `UPDATE refresh_tokens SET grace_used_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND grace_used_at IS NULL
AND revoked_at IS NOT NULL AND revoked_at > datetime('now', ?)`
	claim, claimErr := s.db.Exec(ctx, claimGraceSQL, nsID, hashedRefresh, window)
	if claimErr != nil {
		return "", nil, false, claimErr // transient
	}
	claimed, _ := claim.RowsAffected()
	if claimed == 0 {
		return "", nil, false, nil // grace already consumed (concurrent) → caller 401
	}
	return owner, unmarshalClaims(storedClaims), true, nil
}
// RevokeToken revokes a specific refresh token or all tokens for a subject
func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all bool, subject string) error {
internalCtx := client.WithInternalAuth(ctx)
@ -620,20 +395,14 @@ func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all
return err
}
// Explicit revocation (logout / revoke-all) ALSO burns the reuse-grace slot
// (grace_used_at) so a deliberately-revoked token can NEVER be recovered by
// the bugboard #125 reuse grace. Rotation does not go through RevokeToken,
// so the legitimate lost-response grace path is unaffected; this only closes
// the logout-bypass where a just-logged-out token would otherwise be
// grace-eligible for the 60s window.
if token != "" {
hashedToken := sha256Hex(token)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken)
return err
}
if all && subject != "" {
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject)
return err
}

View File

@ -112,7 +112,7 @@ func TestJWTFlow(t *testing.T) {
sub := "0x1234567890abcdef1234567890abcdef12345678"
ttl := 15 * time.Minute
token, exp, err := s.GenerateJWT(ns, sub, ttl, nil)
token, exp, err := s.GenerateJWT(ns, sub, ttl)
if err != nil {
t.Fatalf("GenerateJWT failed: %v", err)
}
@ -192,7 +192,7 @@ func TestEdDSAJWTFlow(t *testing.T) {
ttl := 15 * time.Minute
// With EdDSA preferred, GenerateJWT should produce an EdDSA token
token, exp, err := s.GenerateJWT(ns, sub, ttl, nil)
token, exp, err := s.GenerateJWT(ns, sub, ttl)
if err != nil {
t.Fatalf("GenerateJWT (EdDSA) failed: %v", err)
}
@ -233,7 +233,7 @@ func TestRS256BackwardCompat(t *testing.T) {
// Generate an RS256 token directly (simulating a legacy token)
s.preferEdDSA = false
token, _, err := s.GenerateJWT("test-ns", "user1", 15*time.Minute, nil)
token, _, err := s.GenerateJWT("test-ns", "user1", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT (RS256) failed: %v", err)
}
@ -447,7 +447,7 @@ func TestEdDSACrossServiceVerify(t *testing.T) {
const wantSub = "BNbN2RNQTsYrrywZCLnhV9j3hd38jwcRqfxBecZX7hDE"
const wantNS = "anchat-test"
token, _, err := signer.GenerateJWT(wantNS, wantSub, 15*time.Minute, nil)
token, _, err := signer.GenerateJWT(wantNS, wantSub, 15*time.Minute)
if err != nil {
t.Fatalf("signer.GenerateJWT: %v", err)
}
@ -478,7 +478,7 @@ func TestEdDSACrossServiceVerify_differentKeysFail(t *testing.T) {
_, verKey, _ := ed25519.GenerateKey(rand.Reader)
verifier.SetEdDSAKey(verKey)
token, _, err := signer.GenerateJWT("ns", "sub", 15*time.Minute, nil)
token, _, err := signer.GenerateJWT("ns", "sub", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}

View File

@ -1,178 +0,0 @@
package gateway
import (
"context"
"encoding/json"
"errors"
"sort"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/serverless"
"github.com/DeBrosOfficial/network/pkg/serverless/registry"
"go.uber.org/zap"
)
// Claims-provider hook (bugboard #548/#920).
//
// A namespace opts into additive, signed JWT claims by deploying a serverless
// function with the RESERVED name "auth-claims-provider". At /v1/auth/verify
// mint time the gateway invokes it (in the namespace's own context, so it can
// read the namespace's tables) with {"wallet","namespace"} and merges the
// string→string object it returns into the JWT's custom claims — e.g.
// {"account_id":"<users.user_id>"} so push devices key on the stable account
// identity rather than the authenticating wallet.
//
// Hard guarantees:
// - FAIL-OPEN: a missing / slow / erroring / malformed provider yields NO
// claims; authentication never breaks because a claims function is down.
// - Reserved claims (sub/iss/aud/iat/nbf/exp/namespace/custom) can never be
// set by the provider — the gateway controls those.
// - Bounded: timeout, max claim count, max total size.
// Tunables for the claims-provider hook.
const (
	// claimsProviderFnName is the reserved function name a namespace deploys to
	// inject additive JWT claims at mint time.
	claimsProviderFnName = "auth-claims-provider"
	// claimsProviderTimeout bounds the provider invocation so a slow/hung
	// function never stalls the auth path past this budget (fail-open after).
	// ResolveClaims applies it as a context deadline around the invoke call.
	claimsProviderTimeout = 2 * time.Second
	// maxCustomClaims / maxCustomClaimsBytes cap what a provider may inject —
	// JWTs ride in headers, and an unbounded claim blob is a DoS / cost vector.
	// maxCustomClaimsBytes is enforced twice by sanitizeProviderClaims: once on
	// the raw payload length and once on the running sum of accepted key and
	// value lengths.
	maxCustomClaims      = 16
	maxCustomClaimsBytes = 4096
	// claimsProviderWarnInterval rate-limits the fail-open WARN so a broken
	// provider doesn't flood the log on every login. The limiter keeps a single
	// timestamp per jwtClaimsProvider, so the interval is shared by every
	// namespace served by that provider instance.
	claimsProviderWarnInterval = 30 * time.Second
)
// reservedClaimKeys can never be injected by a namespace claims provider; the
// gateway owns these. A provider that returns any of them has them dropped:
// sanitizeProviderClaims skips every key present in this set. The empty-struct
// values make this a pure membership set.
var reservedClaimKeys = map[string]struct{}{
	"sub": {}, "iss": {}, "aud": {}, "iat": {},
	"nbf": {}, "exp": {}, "namespace": {}, "custom": {},
}
// jwtClaimsProvider implements auth.ClaimsResolver by invoking the namespace's
// reserved auth-claims-provider function.
type jwtClaimsProvider struct {
	// invoker runs the namespace function; nil disables the hook entirely.
	invoker *serverless.Invoker
	// logger receives the rate-limited fail-open warnings.
	logger *zap.Logger
	// mu guards lastWarnUTC.
	mu sync.Mutex
	// lastWarnUTC is when the last warning was emitted. It is assigned from
	// time.Now() as-is (no UTC conversion) and only used for elapsed-time
	// comparison against claimsProviderWarnInterval.
	lastWarnUTC time.Time
}
// newJWTClaimsProvider constructs the claims resolver around invoker. Passing
// a nil invoker is allowed and turns the hook off (ResolveClaims then always
// returns nil). A nil logger is replaced by a no-op logger; the provider logs
// under the "claims-provider" name.
func newJWTClaimsProvider(invoker *serverless.Invoker, logger *zap.Logger) *jwtClaimsProvider {
	base := logger
	if base == nil {
		base = zap.NewNop()
	}
	provider := &jwtClaimsProvider{invoker: invoker}
	provider.logger = base.Named("claims-provider")
	return provider
}
// ResolveClaims asks the namespace's reserved auth-claims-provider function
// for additive JWT claims and returns the sanitized result. It has no error
// return by design (fail-open contract): a disabled hook, empty arguments, an
// invoke failure, a non-success status, or unusable output all yield nil so
// the token is minted without custom claims.
func (p *jwtClaimsProvider) ResolveClaims(ctx context.Context, wallet, namespace string) map[string]string {
	hookDisabled := p.invoker == nil
	if hookDisabled || wallet == "" || namespace == "" {
		return nil
	}

	payload, marshalErr := json.Marshal(map[string]string{"wallet": wallet, "namespace": namespace})
	if marshalErr != nil {
		return nil
	}

	// Bound the call so a hung provider cannot stall the auth path.
	boundedCtx, release := context.WithTimeout(ctx, claimsProviderTimeout)
	defer release()

	request := &serverless.InvokeRequest{
		Namespace:    namespace,
		FunctionName: claimsProviderFnName,
		Input:        payload,
		// Gateway-initiated, no end-user caller → system trigger skips the
		// per-caller authorization check.
		TriggerType: serverless.TriggerTypeInternal,
	}
	result, invokeErr := p.invoker.Invoke(boundedCtx, request)

	switch {
	case invokeErr != nil || result == nil:
		// A registry miss just means the namespace never deployed a provider —
		// the common case, so it stays silent. Anything else is worth a
		// rate-limited WARN.
		if !errors.Is(invokeErr, registry.ErrFunctionNotFound) {
			p.warnRateLimited("claims provider invoke failed (minting without custom claims)",
				namespace, invokeErr)
		}
		return nil
	case result.Status != serverless.InvocationStatusSuccess:
		p.warnRateLimited("claims provider returned non-success (minting without custom claims)",
			namespace, nil)
		return nil
	}
	return sanitizeProviderClaims(result.Output)
}
// sanitizeProviderClaims turns the provider's RAW stdout into a safe set of
// additive claims. The payload must be a bare JSON object (NOT an {ok,result}
// Ack envelope — per the #976 contract). Only string values are kept, reserved
// keys are discarded, and the result is limited to maxCustomClaims entries and
// maxCustomClaimsBytes of combined key+value length. Empty, oversized,
// unparseable, or claim-less input returns nil (fail-open).
func sanitizeProviderClaims(raw []byte) map[string]string {
	if size := len(raw); size == 0 || size > maxCustomClaimsBytes {
		return nil
	}

	var decoded map[string]any
	if json.Unmarshal(raw, &decoded) != nil || len(decoded) == 0 {
		return nil
	}

	// Walk keys in sorted order: Go map iteration is randomized, and an
	// over-budget payload must truncate to the SAME subset on every login.
	names := make([]string, 0, len(decoded))
	for name := range decoded {
		names = append(names, name)
	}
	sort.Strings(names)

	claims := make(map[string]string, len(decoded))
	usedBytes := 0
	for i := 0; i < len(names) && len(claims) < maxCustomClaims; i++ {
		name := names[i]
		if _, isReserved := reservedClaimKeys[name]; isReserved {
			continue
		}
		value, isString := decoded[name].(string) // string→string contract; non-string values dropped
		if !isString {
			continue
		}
		usedBytes += len(name) + len(value)
		if usedBytes > maxCustomClaimsBytes {
			break
		}
		claims[name] = value
	}

	if len(claims) == 0 {
		return nil
	}
	return claims
}
// warnRateLimited logs msg at WARN level with the namespace and the reserved
// function name, attaching err when it is non-nil. At most one warning is
// emitted per claimsProviderWarnInterval; calls inside the interval are
// dropped silently. The lock is released before logging.
func (p *jwtClaimsProvider) warnRateLimited(msg, namespace string, err error) {
	p.mu.Lock()
	stamp := time.Now()
	suppressed := stamp.Sub(p.lastWarnUTC) < claimsProviderWarnInterval
	if !suppressed {
		p.lastWarnUTC = stamp
	}
	p.mu.Unlock()
	if suppressed {
		return
	}

	details := []zap.Field{
		zap.String("namespace", namespace),
		zap.String("function", claimsProviderFnName),
	}
	if err != nil {
		details = append(details, zap.Error(err))
	}
	p.logger.Warn(msg, details...)
}

View File

@ -1,98 +0,0 @@
package gateway
import (
	"context"
	"testing"
)
// Bugboard #548: the claims-provider sanitizer is the security boundary —
// a namespace function must NOT be able to forge reserved claims, inject
// non-string values, or blow the size budget.

// TestSanitizeProviderClaims_happyPath checks that a well-formed object of
// string claims passes through untouched.
func TestSanitizeProviderClaims_happyPath(t *testing.T) {
	claims := sanitizeProviderClaims([]byte(`{"account_id":"u-123","tier":"pro"}`))
	accountKept := claims["account_id"] == "u-123"
	tierKept := claims["tier"] == "pro"
	if !(accountKept && tierKept) {
		t.Fatalf("expected additive claims, got %v", claims)
	}
}
// TestSanitizeProviderClaims_dropsReservedKeys feeds a payload that tries to
// override gateway-owned claims and checks that only the legitimate claim
// survives.
func TestSanitizeProviderClaims_dropsReservedKeys(t *testing.T) {
	// A malicious provider tries to override sub/exp/namespace — must be dropped.
	const hostile = `{"sub":"0xATTACKER","exp":"9999999999","namespace":"evil","account_id":"u-1"}`
	claims := sanitizeProviderClaims([]byte(hostile))

	forbidden := [...]string{"sub", "exp", "namespace"}
	for i := range forbidden {
		key := forbidden[i]
		if _, leaked := claims[key]; leaked {
			t.Errorf("reserved key %q must be dropped, got %v", key, claims)
		}
	}
	if got := claims["account_id"]; got != "u-1" {
		t.Errorf("legitimate claim dropped: %v", claims)
	}
}
// TestSanitizeProviderClaims_nonStringValuesDropped checks the string→string
// contract: numbers, objects and arrays are discarded, strings are kept.
func TestSanitizeProviderClaims_nonStringValuesDropped(t *testing.T) {
	claims := sanitizeProviderClaims([]byte(`{"account_id":"u-1","num":5,"obj":{"a":1},"arr":[1],"ok":"yes"}`))
	onlyStringsKept := len(claims) == 2 && claims["account_id"] == "u-1" && claims["ok"] == "yes"
	if !onlyStringsKept {
		t.Errorf("non-string values must be dropped; got %v", claims)
	}
}
// TestSanitizeProviderClaims_failOpenOnGarbage checks that every unusable
// payload shape produces nil rather than partial claims.
func TestSanitizeProviderClaims_failOpenOnGarbage(t *testing.T) {
	garbage := [][]byte{
		nil,
		[]byte(``),
		[]byte(`not json`),
		[]byte(`[1,2,3]`),         // array, not object
		[]byte(`"just a string"`), // scalar
		[]byte(`{}`),              // empty object
		[]byte(`{"ok":true,"result":{"account_id":"u"}}`), // Ack envelope (wrong shape) → no top-level string claims
	}
	for i := range garbage {
		input := garbage[i]
		claims := sanitizeProviderClaims(input)
		if claims == nil {
			continue
		}
		t.Errorf("garbage %q must yield nil (fail-open), got %v", input, claims)
	}
}
// TestSanitizeProviderClaims_countAndSizeCapped checks both budgets: a payload
// with too many entries is truncated to at most maxCustomClaims, and a raw
// payload larger than maxCustomClaimsBytes is rejected.
func TestSanitizeProviderClaims_countAndSizeCapped(t *testing.T) {
	// Way more than maxCustomClaims string entries.
	payload := []byte("{")
	for i, entries := 0, maxCustomClaims+20; i < entries; i++ {
		if i > 0 {
			payload = append(payload, ',')
		}
		payload = append(payload, `"k`...)
		payload = append(payload, itoa(i)...)
		payload = append(payload, `":"v"`...)
	}
	payload = append(payload, '}')
	if kept := len(sanitizeProviderClaims(payload)); kept > maxCustomClaims {
		t.Errorf("claim count not capped: got %d, max %d", kept, maxCustomClaims)
	}

	// Oversized total payload → rejected outright.
	oversized := make([]byte, maxCustomClaimsBytes+10)
	for i := range oversized {
		oversized[i] = 'a'
	}
	if claims := sanitizeProviderClaims(oversized); claims != nil {
		t.Errorf("oversized payload must be rejected, got %v", claims)
	}
}
// TestResolveClaims_nilInvokerOrEmptyArgs checks that a provider built without
// an invoker has the hook disabled and resolves to nil claims.
//
// A real context is passed instead of nil: the context package forbids nil
// Contexts even when the callee happens not to touch them, and a future change
// to ResolveClaims must not turn this test into a nil-pointer panic.
func TestResolveClaims_nilInvokerOrEmptyArgs(t *testing.T) {
	p := newJWTClaimsProvider(nil, nil) // nil invoker disables the hook
	if got := p.ResolveClaims(context.Background(), "0xW", "ns"); got != nil {
		t.Errorf("nil invoker must yield nil claims, got %v", got)
	}
}
// itoa formats n in base 10 without pulling extra imports into the test file.
//
// Negative values are rendered with a leading '-' (previously they produced
// an empty string), and digits are written back-to-front into a fixed buffer
// instead of being prepended one slice allocation at a time.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	negative := n < 0
	// Work on the magnitude in unsigned space so the most negative int can be
	// negated without overflow.
	magnitude := uint(n)
	if negative {
		magnitude = -magnitude
	}
	// 20 bytes holds the 19 digits of a 64-bit magnitude plus a sign.
	var buf [20]byte
	pos := len(buf)
	for magnitude > 0 {
		pos--
		buf[pos] = byte('0' + magnitude%10)
		magnitude /= 10
	}
	if negative {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}

View File

@ -5,7 +5,6 @@ import (
"database/sql"
"fmt"
"net"
"net/url"
"os"
"path/filepath"
"strings"
@ -479,21 +478,15 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
// Create secrets manager for serverless functions (AES-256-GCM encrypted).
//
// The encryption key is DERIVED from the cluster secret via HKDF
// (resolveSecretsEncryptionKeyHex), so every gateway in the cluster computes
// the identical key and a secret written on one node decrypts on every other
// node and survives rolling upgrades. This replaces the old per-node
// crypto/rand key file, whose divergence across an upgraded cluster kept
// get_secret broken (bugboard #837). The file key (cfg.SecretsEncryptionKey)
// remains only as a fallback when no cluster secret is available (legacy /
// single-node test rigs). allowEphemeral=false: a missing/invalid key fails
// The encryption key comes from the gateway Config (loaded from
// ~/.orama/secrets/secrets-encryption-key), NOT from engineCfg — engineCfg
// never has the key set, so passing it always produced a per-process
// ephemeral key and made get_secret return undecryptable values
// (bugboard #837). allowEphemeral=false: a missing/invalid key fails
// loudly here and disables get_secret rather than silently corrupting
// secrets.
var secretsMgr serverless.SecretsManager
if secretsKeyHex, keyErr := resolveSecretsEncryptionKeyHex(cfg.ClusterSecret, cfg.SecretsEncryptionKey); keyErr != nil {
logger.ComponentWarn(logging.ComponentGeneral, "Failed to derive secrets encryption key; get_secret will be unavailable",
zap.Error(keyErr))
} else if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, secretsKeyHex, false, logger.Logger); secretsErr != nil {
if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, cfg.SecretsEncryptionKey, false, logger.Logger); secretsErr != nil {
logger.ComponentWarn(logging.ComponentGeneral, "Failed to initialize secrets manager; get_secret will be unavailable",
zap.Error(secretsErr))
} else {
@ -511,7 +504,7 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
//
// PushDispatcher (legacy) is set only when YAML defaults exist —
// kept for back-compat with code that hasn't migrated to Manager.
pushDispatcher, pushStore, pushManager, pushCfgStore, pushCredManager, err := buildPushDispatcher(cfg, deps.ORMClient, deps.Client, logger)
pushDispatcher, pushStore, pushManager, pushCfgStore, pushCredManager, err := buildPushDispatcher(cfg, deps.ORMClient, logger)
if err != nil {
// Non-fatal: log and continue. Functions calling push_send will get nil
// (silent no-op) and HTTP /v1/push/* endpoints return 503.
@ -655,14 +648,6 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
authService.SetRqliteClient(deps.ORMClient)
}
// Wire the namespace claims-provider hook (bugboard #548): at JWT mint time
// the auth service invokes the namespace's reserved `auth-claims-provider`
// function (if deployed) and merges its additive claims (e.g. account_id)
// into the token. Fail-open — a missing/slow provider never breaks auth.
if deps.ServerlessInvoker != nil {
authService.SetClaimsResolver(newJWTClaimsProvider(deps.ServerlessInvoker, logger.Logger))
}
// Load or create EdDSA key for new JWT tokens. Bug #215 fix: when
// cfg.ClusterSecret is set, the key is derived deterministically from
// it via HKDF, so every gateway in the cluster shares the same Ed25519
@ -928,7 +913,6 @@ func appendRQLiteQueryParams(dsn string) string {
func buildPushDispatcher(
cfg *Config,
db rqlite.Client,
globalDB client.NetworkClient,
logger *logging.ColoredLogger,
) (*push.PushDispatcher, push.PushDeviceStore, *push.Manager, push.ConfigStore, *pushcreds.Manager, error) {
if cfg.ClusterSecret == "" {
@ -965,25 +949,6 @@ func buildPushDispatcher(
pushcreds.Register(pushapns.NewValidator())
pushcreds.Register(pushntfy.NewValidator())
// ntfy cluster fan-out (bugboard #858): the default push infra runs an
// independent ntfy per node with no shared store, so a publish must reach
// EVERY active node for the subscriber's instance (picked by round-robin
// DNS) to receive it. Build a resolver over the global dns_nodes table; the
// factory attaches it only to providers using the shared default base URL
// (a namespace pointing ntfy at its own server is never fanned across our
// cluster). nil globalDB or an unparseable base URL → no fan-out (provider
// falls back to the single base URL).
var ntfyFanout *ntfyFanoutResolver
var ntfyFanoutHost string
if globalDB != nil {
if base := strings.TrimSpace(cfg.NtfyBaseURL); base != "" {
if u, perr := url.Parse(base); perr == nil && u.Hostname() != "" {
ntfyFanoutHost = u.Hostname()
ntfyFanout = newNtfyFanoutResolver(globalDB, u.Scheme, u.Port(), defaultNtfyFanoutTTL)
}
}
}
// ProviderFactory turns a resolved Config into the right set of
// provider instances. Lives here in dependencies.go because this is
// the only place that imports both the manager package and the
@ -1024,13 +989,6 @@ func buildPushDispatcher(
}
}
if ntfyCfg.BaseURL != "" {
// Fan out across all push nodes ONLY for the shared default infra.
// A namespace that overrode BaseURL with its own ntfy server keeps
// single-host delivery (its server, not our cluster).
if ntfyFanout != nil && ntfyCfg.BaseURL == cfg.NtfyBaseURL {
ntfyCfg.FanoutResolver = ntfyFanout.Hosts
ntfyCfg.FanoutHostHeader = ntfyFanoutHost
}
ps = append(ps, pushntfy.New(ntfyCfg, logger.Logger))
}
if c.ExpoAccessToken != "" {

View File

@ -393,32 +393,6 @@ func TestRefreshHandler_NilAuthService(t *testing.T) {
}
}
// TestRefreshHandler_TransientError_returns503Retryable guards bugboard #125.
// A refresh failure that is NOT a verdict on the token (here
// ErrRotationNotConfigured, from a service built without an rqlite client)
// must come back as a retryable 503 carrying Retry-After — never as a 401,
// which would push a locked device into an impossible SIWE re-auth while a
// call is ringing.
func TestRefreshHandler_TransientError_returns503Retryable(t *testing.T) {
	service, svcErr := authsvc.NewService(testLogger(), nil, "", "default")
	if svcErr != nil {
		t.Fatalf("failed to create auth service: %v", svcErr)
	}
	handlers := NewHandlers(testLogger(), service, nil, "default", noopInternalAuth)

	payload, _ := json.Marshal(RefreshRequest{RefreshToken: "some-valid-looking-token"})
	request := httptest.NewRequest(http.MethodPost, "/v1/auth/refresh", bytes.NewReader(payload))
	request.Header.Set("Content-Type", "application/json")
	recorder := httptest.NewRecorder()

	handlers.RefreshHandler(recorder, request)

	if status := recorder.Code; status != http.StatusServiceUnavailable {
		t.Fatalf("transient refresh failure must be 503, got %d", status)
	}
	if retryAfter := recorder.Header().Get("Retry-After"); retryAfter == "" {
		t.Error("503 refresh response should carry a Retry-After header")
	}
}
// --- APIKeyToJWTHandler tests ---------------------------------------------
func TestAPIKeyToJWTHandler_MissingKey(t *testing.T) {

View File

@ -2,7 +2,6 @@ package auth
import (
"encoding/json"
"errors"
"net/http"
"strings"
"time"
@ -58,7 +57,7 @@ func (h *Handlers) APIKeyToJWTHandler(w http.ResponseWriter, r *http.Request) {
return
}
token, expUnix, err := h.authService.GenerateJWT(ns, key, 15*time.Minute, nil)
token, expUnix, err := h.authService.GenerateJWT(ns, key, 15*time.Minute)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
@ -104,20 +103,11 @@ func (h *Handlers) RefreshHandler(w http.ResponseWriter, r *http.Request) {
// the SDK persists it (bug #239 fix) and uses it on the next refresh.
token, newRefreshToken, subject, expUnix, err := h.authService.RefreshToken(r.Context(), req.RefreshToken, req.Namespace)
if err != nil {
// Bugboard #125: a TRANSIENT rotation failure (rqlite leader briefly
// unavailable during a rolling restart) must surface as a retryable
// 503 — NOT a 401 — so the client retries within the call-ring window
// instead of tearing the session down to a full SIWE re-auth, which is
// impossible on a locked device answering a VoIP-woken call.
if errors.Is(err, authsvc.ErrRefreshTransient) || errors.Is(err, authsvc.ErrRotationNotConfigured) {
w.Header().Set("Retry-After", "1")
writeError(w, http.StatusServiceUnavailable, "refresh temporarily unavailable, retry")
return
}
// Genuine bad/expired/replayed token. The service emits a WARN log on
// replay (ErrRefreshTokenReplay) so the operator can investigate. We
// surface a generic 401 regardless — leaking "your token was already
// used" would help an attacker confirm a stolen token was rotated.
// The service emits a WARN log on replay (ErrRefreshTokenReplay)
// so the operator can investigate. We surface a generic 401 here
// regardless — leaking "your token was already used" to the
// caller would help an attacker confirm a stolen token has been
// rotated.
writeError(w, http.StatusUnauthorized, "invalid or expired refresh token")
return
}

View File

@ -157,24 +157,6 @@ func (h *ServerlessHandlers) getJWTSubjectFromRequest(r *http.Request) string {
return strings.TrimSpace(claims.Sub)
}
// getJWTExpiryFromRequest reports the `exp` claim (unix seconds) of the JWT
// stored on the request context under ctxkeys.JWT. It returns 0 when the
// context carries no value there, the value is not a *auth.JWTClaims, or the
// claims pointer is nil (e.g. API-key auth). Persistent WS connections
// capture this at upgrade to enforce mid-session expiry — a long-lived socket
// must stop serving RPCs once its authorizing token expires, unless refreshed
// via the #321 auth.refresh control frame. Bugboard #868.
func (h *ServerlessHandlers) getJWTExpiryFromRequest(r *http.Request) int64 {
	// The type assertion fails (ok == false) for a missing value as well as
	// for a value of the wrong type, so one check covers both.
	if claims, ok := r.Context().Value(ctxkeys.JWT).(*auth.JWTClaims); ok && claims != nil {
		return claims.Exp
	}
	return 0
}
// getWalletFromRequest extracts wallet address from JWT.
func (h *ServerlessHandlers) getWalletFromRequest(r *http.Request) string {
// Import strings package functions inline to avoid circular dependencies

View File

@ -1,152 +0,0 @@
package serverless
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/gateway/auth"
"github.com/DeBrosOfficial/network/pkg/gateway/ctxkeys"
)
// TestWSJWTExpired is the security regression guard for bugboard #868. A
// persistent WS authenticates ONCE at upgrade, so the read loop relies on
// wsJWTExpired to cut off application frames once the authorizing JWT is past
// exp+grace.
//
// A false negative here (reporting a clearly-expired token as live) would let
// that token keep full RPC access — including turn.credentials minting — for
// the socket's lifetime; a false positive would drop still-valid sockets.
func TestWSJWTExpired(t *testing.T) {
	// Fixed reference instant keeps the table deterministic (production passes
	// time.Now(); the pure function takes the instant as an argument).
	now := time.Unix(1_700_000_000, 0)
	grace := 120 * time.Second

	type scenario struct {
		label   string
		exp     int64
		at      time.Time
		expired bool
	}
	scenarios := []scenario{
		{"no expiry to enforce (API-key auth, exp=0) never expires", 0, now, false},
		{"negative exp treated as no-expiry (defensive)", -5, now, false},
		{"token valid, well before exp", now.Add(10 * time.Minute).Unix(), now, false},
		{"token just past exp but inside grace window — still allowed", now.Add(-30 * time.Second).Unix(), now, false},
		{"token exactly at exp+grace boundary — not yet expired (After is strict)", now.Add(-grace).Unix(), now, false},
		{"token past exp+grace — expired, must reject", now.Add(-(grace + time.Second)).Unix(), now, true},
		{"token long expired — expired", now.Add(-24 * time.Hour).Unix(), now, true},
	}

	for i := range scenarios {
		sc := scenarios[i]
		t.Run(sc.label, func(t *testing.T) {
			if got := wsJWTExpired(sc.exp, sc.at, grace); got != sc.expired {
				t.Errorf("wsJWTExpired(exp=%d, now=%d, grace=%s) = %v; want %v",
					sc.exp, sc.at.Unix(), grace, got, sc.expired)
			}
		})
	}
}
// TestGetJWTExpiryFromRequest checks that the handler reads the authorizing
// JWT's exp from the request context at upgrade time. That value is what the
// read loop enforces for the socket's lifetime (#868): returning 0 for a
// JWT-authenticated request would silently switch expiry enforcement off and
// re-open the bug.
func TestGetJWTExpiryFromRequest(t *testing.T) {
	handlers := newTestHandlers(nil)

	// plainRequest has nothing stored under ctxkeys.JWT.
	plainRequest := func() *http.Request {
		return httptest.NewRequest(http.MethodGet, "/", nil)
	}
	// requestWithClaims stores claims (possibly a typed nil) under ctxkeys.JWT.
	requestWithClaims := func(claims *auth.JWTClaims) *http.Request {
		base := plainRequest()
		return base.WithContext(context.WithValue(base.Context(), ctxkeys.JWT, claims))
	}

	t.Run("JWT with exp returns exp", func(t *testing.T) {
		req := requestWithClaims(&auth.JWTClaims{Sub: "alice", Exp: 1_700_000_123})
		got := handlers.getJWTExpiryFromRequest(req)
		if got != 1_700_000_123 {
			t.Errorf("getJWTExpiryFromRequest = %d; want 1700000123", got)
		}
	})

	t.Run("no JWT on context returns 0 (API-key / unauthenticated)", func(t *testing.T) {
		got := handlers.getJWTExpiryFromRequest(plainRequest())
		if got != 0 {
			t.Errorf("getJWTExpiryFromRequest = %d; want 0", got)
		}
	})

	t.Run("nil claims under key returns 0", func(t *testing.T) {
		var missing *auth.JWTClaims
		got := handlers.getJWTExpiryFromRequest(requestWithClaims(missing))
		if got != 0 {
			t.Errorf("getJWTExpiryFromRequest = %d; want 0", got)
		}
	})
}
// TestWSAuthState_refreshExtendsExpiry pins the auth.refresh contract the read
// loop depends on (#868 + #321): once a refresh succeeds, the enforced expiry
// becomes the new token's exp, so a socket that had run past its old deadline
// is served again.
//
// Only the state transition is asserted here; the full handler needs a live WS
// connection for the ack write. The invariant: an instant that expires the old
// token does not expire the socket after the refresh.
func TestWSAuthState_refreshExtendsExpiry(t *testing.T) {
	const grace = 120 * time.Second
	now := time.Unix(1_700_000_000, 0)

	// Start from a token whose exp lies a full minute beyond the grace window.
	state := &wsAuthState{expUnix: now.Add(-(grace + time.Minute)).Unix()}
	if expired := wsJWTExpired(state.expUnix, now, grace); !expired {
		t.Fatalf("precondition: old token should be expired at now")
	}

	// Mirror the successful-refresh path: the state adopts the new token's exp.
	state.expUnix = now.Add(15 * time.Minute).Unix()

	if expired := wsJWTExpired(state.expUnix, now, grace); expired {
		t.Errorf("after refresh the socket must NOT be expired (exp=%d, now=%d)",
			state.expUnix, now.Unix())
	}
}

View File

@ -22,51 +22,6 @@ import (
// application traffic that goes straight to WASM. Bugboard #321.
var oramaControlFramePrefix = []byte(`"__orama"`)
// wsJWTExpiryGrace is how far past a JWT's `exp` the gateway keeps serving
// application frames on a persistent WS. The slack absorbs clock skew between
// the gateway and the issuing path, plus the client's refresh round-trip (the
// #321 auth.refresh control frame). Bugboard #868: with no deadline at all, a
// socket authenticated once at upgrade would keep full RPC access — including
// turn.credentials minting — for its whole lifetime after the token expired.
//
// On the auth.refresh path ParseAndVerifyJWT applies its own ±60s exp skew, so
// the worst-case service past exp is this grace plus that skew (~180s) rather
// than a flat 120s. Both bounds are intentional; the socket is force-closed
// once they have elapsed.
const wsJWTExpiryGrace = 120 * time.Second

// wsCloseJWTExpired is the WS close code sent when a persistent socket is torn
// down for running past its JWT expiry. It lives in the private-use range
// (4000-4999), separate from protocol codes, so a client can treat it as
// "reconnect with a fresh token". Bugboard #868.
const wsCloseJWTExpired = 4401
// wsAuthState carries the live JWT expiry for a persistent WS across the read
// loop and the auth.refresh control handler. Both run in the SAME goroutine —
// control frames are handled inline in the read loop before any frame reaches
// WASM — so the field needs no synchronization. Bugboard #868.
//
// Lifecycle: the value is captured once at upgrade from the request's JWT,
// replaced with the new token's exp by a successful auth.refresh, and checked
// by the read loop before each application frame is submitted.
type wsAuthState struct {
// expUnix is the `exp` (unix seconds) of the JWT currently authorizing
// this socket. 0 means "no expiry to enforce" (e.g. API-key auth or a
// token without exp) — such sockets are exempt from mid-session expiry.
// wsJWTExpired treats any value <= 0 the same way.
expUnix int64
}
// wsJWTExpired tells whether a persistent WS, authorized by a JWT whose exp is
// expUnix (unix seconds), is past its enforcement deadline at the instant now.
// The deadline is exp plus grace, which leaves room for clock skew and the
// refresh round-trip. The comparison is strict: at exactly exp+grace the
// socket is still live. A non-positive expUnix means there is nothing to
// enforce, so the result is always false. Bugboard #868.
func wsJWTExpired(expUnix int64, now time.Time, grace time.Duration) bool {
	hasExpiry := expUnix > 0
	if !hasExpiry {
		return false
	}
	deadline := time.Unix(expUnix, 0).Add(grace)
	return deadline.Before(now)
}
// oramaControlFrame is the wire shape for gateway-handled control
// frames on a persistent WS. The single Type field discriminates;
// payload fields specific to each Type ride alongside.
@ -142,12 +97,6 @@ func (h *ServerlessHandlers) handlePersistentWebSocket(
invCtx := h.buildPersistentInvocationContext(r, fn, clientID)
callerWallet := invCtx.CallerWallet
// Capture the authorizing JWT's expiry so the read loop can enforce it
// for the socket's lifetime (bugboard #868). A successful auth.refresh
// control frame updates this in place; 0 (non-JWT auth) disables the
// check.
authState := &wsAuthState{expUnix: h.getJWTExpiryFromRequest(r)}
// Instantiate the persistent module. This compiles once (cached) and
// creates one wazero instance bound to this connection.
module, err := h.engine.InstantiatePersistent(r.Context(), fn, invCtx)
@ -247,7 +196,7 @@ func (h *ServerlessHandlers) handlePersistentWebSocket(
// avoids json.Unmarshal for every application frame. Only
// frames carrying the `"__orama"` key get parsed.
if bytes.Contains(frame, oramaControlFramePrefix) {
handled, ackErr := h.handleOramaControlFrame(frame, fn, inst, authState, namespace, clientID, conn)
handled, ackErr := h.handleOramaControlFrame(frame, fn, inst, namespace, clientID, conn)
if ackErr != nil {
h.logger.Warn("persistent WS: control-frame ack write failed",
zap.String("client_id", clientID),
@ -264,26 +213,6 @@ func (h *ServerlessHandlers) handlePersistentWebSocket(
// application frame.
}
// Bugboard #868: a persistent WS authenticates ONCE at upgrade.
// Before handing an application frame to WASM, reject it once the
// authorizing JWT is past exp+grace — otherwise an expired token
// keeps serving RPCs (incl. turn.credentials minting) indefinitely.
// The client keeps the socket alive by sending an
// {"__orama":"auth.refresh"} control frame (handled above, which
// bypasses this check) before the token expires. The check runs
// only on application frames so an expired client can still recover
// via auth.refresh rather than being locked out.
if wsJWTExpired(authState.expUnix, time.Now(), wsJWTExpiryGrace) {
h.logger.Info("persistent WS: closing — JWT expired without refresh",
zap.String("client_id", clientID),
zap.String("namespace", namespace),
zap.Int64("jwt_exp", authState.expUnix))
_ = conn.WriteControl(websocket.CloseMessage,
websocket.FormatCloseMessage(wsCloseJWTExpired, "jwt expired; reconnect with a fresh token"),
time.Now().Add(time.Second))
break
}
if err := inst.Submit(frame); err != nil {
h.logger.Warn("persistent WS submit failed (queue full?)",
zap.String("client_id", clientID),
@ -347,7 +276,6 @@ func (h *ServerlessHandlers) handleOramaControlFrame(
frame []byte,
fn *serverless.Function,
inst *persistent.Instance,
authState *wsAuthState,
namespace, clientID string,
conn *websocket.Conn,
) (handled bool, ackErr error) {
@ -363,7 +291,7 @@ func (h *ServerlessHandlers) handleOramaControlFrame(
switch ctrl.Type {
case "auth.refresh":
return true, h.handleAuthRefresh(ctrl, fn, inst, authState, namespace, clientID, conn)
return true, h.handleAuthRefresh(ctrl, fn, inst, namespace, clientID, conn)
default:
// Unknown control type — ack with an error so the client knows
// the frame was seen but ignored. Treat as handled (don't
@ -384,7 +312,6 @@ func (h *ServerlessHandlers) handleAuthRefresh(
ctrl oramaControlFrame,
fn *serverless.Function,
inst *persistent.Instance,
authState *wsAuthState,
namespace, clientID string,
conn *websocket.Conn,
) error {
@ -480,12 +407,6 @@ func (h *ServerlessHandlers) handleAuthRefresh(
})
}
// Extend the socket's expiry enforcement to the new token's exp so the
// read loop keeps serving RPCs past the old deadline (bugboard #868).
// authState and the read loop share this goroutine, so the write is
// race-free.
authState.expUnix = claims.Exp
h.logger.Info("persistent WS: auth.refresh applied",
zap.String("client_id", clientID),
zap.String("namespace", namespace),

View File

@ -23,7 +23,7 @@ func TestJWTGenerateAndParse(t *testing.T) {
t.Fatalf("failed to create service: %v", err)
}
tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute, nil)
tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute)
if err != nil || exp <= 0 {
t.Fatalf("gen err=%v exp=%d", err, exp)
}
@ -50,7 +50,7 @@ func TestJWTExpired(t *testing.T) {
}
// Use sufficiently negative TTL to bypass allowed clock skew
tok, _, err := svc.GenerateJWT("ns1", "subj", -2*time.Minute, nil)
tok, _, err := svc.GenerateJWT("ns1", "subj", -2*time.Minute)
if err != nil {
t.Fatalf("gen err=%v", err)
}

View File

@ -51,7 +51,7 @@ func newAuthServiceForTest(t *testing.T) *auth.Service {
func TestAuthMiddleware_WSJWTQuery_validToken(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET_SUBJECT", 15*time.Minute, nil)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET_SUBJECT", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -125,7 +125,7 @@ func TestAuthMiddleware_WSJWTQuery_ignoredOnNonWSRequest(t *testing.T) {
// privacy issues of JWTs leaking via referrer headers, browser history,
// and access logs.
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("ns", "sub", 15*time.Minute, nil)
token, _, err := svc.GenerateJWT("ns", "sub", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -156,8 +156,8 @@ func TestAuthMiddleware_WSJWTQuery_headerWinsOverQuery(t *testing.T) {
// Header path runs FIRST and wins. Verifies the query fallback is a
// fallback, not an override.
svc := newAuthServiceForTest(t)
headerJWT, _, _ := svc.GenerateJWT("ns-header", "sub-header", 15*time.Minute, nil)
queryJWT, _, _ := svc.GenerateJWT("ns-query", "sub-query", 15*time.Minute, nil)
headerJWT, _, _ := svc.GenerateJWT("ns-header", "sub-header", 15*time.Minute)
queryJWT, _, _ := svc.GenerateJWT("ns-query", "sub-query", 15*time.Minute)
g := &Gateway{authService: svc}
@ -242,7 +242,7 @@ func TestAuthMiddleware_WSJWTQuery_malformedJWTFallsThrough(t *testing.T) {
func TestValidateAuthForNamespaceProxy_WSJWTQuery(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -270,7 +270,7 @@ func TestValidateAuthForNamespaceProxy_WSJWTQuery(t *testing.T) {
func TestValidateAuthForNamespaceProxy_WSJWTQuery_ignoredOnNonWS(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -295,7 +295,7 @@ func TestValidateAuthForNamespaceProxy_WSJWTQuery_ignoredOnNonWS(t *testing.T) {
// doesn't leak into proxy hops or downstream logs.
func TestAuthMiddleware_WSJWTQuery_strippedAfterVerify(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}

View File

@ -1,95 +0,0 @@
package gateway
import (
"context"
"fmt"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/client"
)
// defaultNtfyFanoutTTL is the default lifetime of the cached active-push-node
// list before dns_nodes is queried again. It matches the DNS heartbeat
// cadence, so a node that is added or removed is noticed within one heartbeat
// without rqlite being queried on every push.
const defaultNtfyFanoutTTL time.Duration = 30 * time.Second
// ntfyFanoutResolver resolves the set of ntfy publish base URLs (one per active
// push node) for fan-out delivery, caching the result for a short TTL. Each
// node runs an independent ntfy with no shared store, so a publish must reach
// every node for the subscriber's instance to receive it (bugboard #858).
//
// Hosts builds each URL as scheme + "://" + ip, followed by ":" + port when
// port is non-empty.
type ntfyFanoutResolver struct {
// query returns the public IPs of the currently-active push nodes. Injected
// so the cache/transform logic is unit-testable without a live cluster.
query func(ctx context.Context) ([]string, error)
scheme string // "https" (prod) / "http" (dev), from the configured base URL
port string // explicit port from the base URL, or "" for the scheme default
// ttl is how long a successfully built host list is served from cache
// before Hosts calls query again.
ttl time.Duration
// mu is held by Hosts for its whole duration, including the query call,
// and guards cached and cachedAt.
mu sync.Mutex
// cached is the host list from the last successful refresh; nil means no
// refresh has succeeded yet (an empty non-nil list is a valid cached value).
cached []string
// cachedAt is when cached was last replaced.
cachedAt time.Time
}
// newNtfyFanoutResolver wires a resolver to the global dns_nodes table: its
// query selects ip_address for every row whose status is 'active', using the
// internal-auth context. scheme, port and ttl are stored as given.
func newNtfyFanoutResolver(globalDB client.NetworkClient, scheme, port string, ttl time.Duration) *ntfyFanoutResolver {
	// activeIPs reads the active node IPs, dropping rows that are empty or
	// whose first column is not a non-empty string.
	activeIPs := func(ctx context.Context) ([]string, error) {
		res, err := globalDB.Database().Query(client.WithInternalAuth(ctx), "SELECT ip_address FROM dns_nodes WHERE status = 'active'")
		if err != nil {
			return nil, fmt.Errorf("query active push nodes: %w", err)
		}
		if res == nil {
			return nil, nil
		}
		collected := make([]string, 0, len(res.Rows))
		for _, row := range res.Rows {
			if len(row) == 0 {
				continue
			}
			ip, isString := row[0].(string)
			if !isString || ip == "" {
				continue
			}
			collected = append(collected, ip)
		}
		return collected, nil
	}

	resolver := &ntfyFanoutResolver{
		query:  activeIPs,
		scheme: scheme,
		port:   port,
		ttl:    ttl,
	}
	return resolver
}
// Hosts returns the fan-out base URLs, serving them from cache while the cache
// is younger than the TTL and re-running the query otherwise. When the query
// fails, the last-known list (nil if there has never been one) is returned
// together with the error, so the caller can fall back to its base URL instead
// of dropping a push. Empty IPs are skipped. The resolver's mutex is held for
// the whole call.
func (r *ntfyFanoutResolver) Hosts(ctx context.Context) ([]string, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	// A nil cache means "never populated"; an empty non-nil list still counts.
	fresh := r.cached != nil && time.Since(r.cachedAt) < r.ttl
	if fresh {
		return r.cached, nil
	}

	ips, err := r.query(ctx)
	if err != nil {
		return r.cached, err
	}

	var portPart string
	if r.port != "" {
		portPart = ":" + r.port
	}
	urls := make([]string, 0, len(ips))
	for i := range ips {
		if ips[i] != "" {
			urls = append(urls, r.scheme+"://"+ips[i]+portPart)
		}
	}

	r.cached, r.cachedAt = urls, time.Now()
	return urls, nil
}

View File

@ -1,125 +0,0 @@
package gateway
import (
"context"
"errors"
"testing"
"time"
)
// Bugboard #858 — the fan-out resolver maps active dns_nodes IPs to ntfy
// publish base URLs and caches them for a short TTL. The tests in this file
// pin that transform and the caching behavior.

func TestNtfyFanoutResolver_buildsSchemeHostPort(t *testing.T) {
	ips := []string{"1.2.3.4", "5.6.7.8"}
	resolver := &ntfyFanoutResolver{
		scheme: "https",
		port:   "",
		ttl:    time.Minute,
		query:  func(context.Context) ([]string, error) { return ips, nil },
	}

	got, err := resolver.Hosts(context.Background())
	if err != nil {
		t.Fatalf("Hosts: %v", err)
	}

	expected := []string{"https://1.2.3.4", "https://5.6.7.8"}
	if len(got) != len(expected) {
		t.Fatalf("got %v; want %v", got, expected)
	}
	for i, w := range expected {
		if got[i] != w {
			t.Errorf("host[%d] = %q; want %q", i, got[i], w)
		}
	}
}
// An explicit port configured on the resolver is appended to every host URL.
func TestNtfyFanoutResolver_includesExplicitPort(t *testing.T) {
	resolver := &ntfyFanoutResolver{
		scheme: "http",
		port:   "8090",
		ttl:    time.Minute,
		query: func(context.Context) ([]string, error) {
			return []string{"10.0.0.6"}, nil
		},
	}

	got, _ := resolver.Hosts(context.Background())
	matches := len(got) == 1 && got[0] == "http://10.0.0.6:8090"
	if !matches {
		t.Errorf("got %v; want [http://10.0.0.6:8090]", got)
	}
}
// Empty IP strings returned by the query must not produce host URLs.
func TestNtfyFanoutResolver_skipsEmptyIPs(t *testing.T) {
	resolver := &ntfyFanoutResolver{
		scheme: "https",
		ttl:    time.Minute,
		query: func(context.Context) ([]string, error) {
			return []string{"", "1.2.3.4", ""}, nil
		},
	}

	got, _ := resolver.Hosts(context.Background())
	onlyRealIP := len(got) == 1 && got[0] == "https://1.2.3.4"
	if !onlyRealIP {
		t.Errorf("got %v; want only the non-empty IP", got)
	}
}
// Repeated Hosts calls inside the TTL must hit the query exactly once.
func TestNtfyFanoutResolver_cachesWithinTTL(t *testing.T) {
	var queries int
	resolver := &ntfyFanoutResolver{
		scheme: "https",
		ttl:    time.Minute,
		query: func(context.Context) ([]string, error) {
			queries++
			return []string{"1.2.3.4"}, nil
		},
	}

	ctx := context.Background()
	for attempt := 0; attempt < 3; attempt++ {
		_, err := resolver.Hosts(ctx)
		if err != nil {
			t.Fatalf("Hosts: %v", err)
		}
	}

	if queries != 1 {
		t.Errorf("query called %d times; want 1 (cached within TTL)", queries)
	}
}
// Once the TTL has elapsed, the next Hosts call must run the query again.
func TestNtfyFanoutResolver_requeriesAfterTTL(t *testing.T) {
	var queries int
	resolver := &ntfyFanoutResolver{
		scheme: "https",
		// A one-nanosecond TTL is already stale by the second call.
		ttl: time.Nanosecond,
		query: func(context.Context) ([]string, error) {
			queries++
			return []string{"1.2.3.4"}, nil
		},
	}

	ctx := context.Background()
	_, _ = resolver.Hosts(ctx)
	time.Sleep(time.Millisecond)
	_, _ = resolver.Hosts(ctx)

	if queries != 2 {
		t.Errorf("query called %d times; want 2 (TTL expired between calls)", queries)
	}
}
// When the query fails after the cache was primed, Hosts must hand back the
// stale list together with the error so the caller can fall back instead of
// dropping the push.
func TestNtfyFanoutResolver_queryError_returnsStaleCache(t *testing.T) {
	var broken bool
	resolver := &ntfyFanoutResolver{
		scheme: "https",
		ttl:    time.Nanosecond,
		query: func(context.Context) ([]string, error) {
			if !broken {
				return []string{"1.2.3.4"}, nil
			}
			return nil, errors.New("rqlite unreachable")
		},
	}
	ctx := context.Background()

	// Populate the cache while the query still works.
	if _, err := resolver.Hosts(ctx); err != nil {
		t.Fatalf("prime: %v", err)
	}
	// Let the one-nanosecond TTL lapse so the next call re-queries.
	time.Sleep(time.Millisecond)

	broken = true
	stale, err := resolver.Hosts(ctx)
	if err == nil {
		t.Fatal("want the query error surfaced")
	}
	gotStale := len(stale) == 1 && stale[0] == "https://1.2.3.4"
	if !gotStale {
		t.Errorf("want the stale cache returned on error; got %v", stale)
	}
}

View File

@ -1,49 +0,0 @@
package gateway
import (
"encoding/hex"
"strings"
"github.com/DeBrosOfficial/network/pkg/secrets"
)
// secretsEncryptionDerivePurpose is the HKDF info label used to derive the
// function-secrets AES-256 key from the cluster secret. Deriving it (instead of
// generating a per-node crypto/rand key file) guarantees every gateway in the
// cluster computes the IDENTICAL key, so a secret written on one node decrypts
// on every other node and survives rolling upgrades — eliminating the
// key-divergence / convergence-window class that kept get_secret broken for
// days (bugboard #837). Same pattern as the cluster-wide JWT signing key
// (jwtEdDSADerivePurpose) and the TURN encryption key ("turn-encryption").
//
// The label is passed to secrets.DeriveKey by resolveSecretsEncryptionKeyHex.
//
// Bumping the version label (e.g. "...-v2") is a DELIBERATE rotation that
// invalidates every stored function secret (they must be re-`set`). It must
// never be changed casually.
const secretsEncryptionDerivePurpose = "orama-secrets-encryption-v1"
// resolveSecretsEncryptionKeyHex picks the hex-encoded AES-256 key that the
// serverless secrets manager uses to encrypt and decrypt function secrets.
//
// With a cluster secret: the key is derived from it via
// secrets.DeriveKey under secretsEncryptionDerivePurpose, so every gateway
// holding the same secret computes the same key, across restarts and rolling
// upgrades. The secret is whitespace-trimmed first, so a stray trailing
// newline in one node's secret file cannot make that node derive a different
// key (the host gateway reads the file untrimmed while the namespace gateway
// trims it — the divergence behind #837). A derivation error is returned
// as-is with an empty key.
//
// Without a cluster secret (empty or whitespace-only — single-node test rigs,
// legacy deployments): the explicitly configured file key is returned,
// trimmed. If that is empty too, the result is "" with a nil error, which
// makes the production secrets manager fail loudly (NewDBSecretsManager with
// allowEphemeral=false) rather than silently using an ephemeral key.
func resolveSecretsEncryptionKeyHex(clusterSecret, fileKeyHex string) (string, error) {
	trimmedSecret := strings.TrimSpace(clusterSecret)
	if trimmedSecret == "" {
		return strings.TrimSpace(fileKeyHex), nil
	}

	derived, err := secrets.DeriveKey(trimmedSecret, secretsEncryptionDerivePurpose)
	if err != nil {
		return "", err
	}
	return hex.EncodeToString(derived), nil
}

View File

@ -1,95 +0,0 @@
package gateway
import (
"encoding/hex"
"testing"
"github.com/DeBrosOfficial/network/pkg/secrets"
)
// Bugboard #837 — the function-secrets AES key is DERIVED from the cluster
// secret rather than read from a per-node random file, so every gateway
// computes the same key and stored secrets survive rolling upgrades. The tests
// in this file pin that derivation.

func TestResolveSecretsEncryptionKeyHex_deterministic(t *testing.T) {
	// The same cluster secret must always produce a byte-identical key:
	// that is what lets a secret set on one node decrypt on every other.
	const cs = "cluster-secret-abc123"
	derive := func() string {
		key, err := resolveSecretsEncryptionKeyHex(cs, "")
		if err != nil {
			t.Fatalf("resolve: %v", err)
		}
		return key
	}

	first := derive()
	second := derive()
	if first == "" || first != second {
		t.Fatalf("derivation not deterministic: %q vs %q", first, second)
	}

	// An AES-256 key is 32 bytes, i.e. 64 hex characters.
	raw, err := hex.DecodeString(first)
	if err != nil || len(raw) != 32 {
		t.Errorf("derived key is not 32-byte hex: len(raw)=%d err=%v", len(raw), err)
	}
}
// Surrounding whitespace on the cluster secret must not affect the derived
// key; otherwise a gateway that reads the secret file untrimmed and one that
// trims it would diverge and reintroduce #837.
func TestResolveSecretsEncryptionKeyHex_trimInvariant(t *testing.T) {
	derive := func(secret string) string {
		key, _ := resolveSecretsEncryptionKeyHex(secret, "")
		return key
	}

	bare := derive("cluster-secret-abc123")
	newline := derive("cluster-secret-abc123\n")
	padded := derive(" cluster-secret-abc123\t\n")

	if !(bare == newline && bare == padded) {
		t.Errorf("derived key is not whitespace-invariant: %q / %q / %q", bare, newline, padded)
	}
}
// Two different cluster secrets must never derive the same key.
func TestResolveSecretsEncryptionKeyHex_distinctSecretsDistinctKeys(t *testing.T) {
	keyA, _ := resolveSecretsEncryptionKeyHex("cluster-secret-A", "")
	keyB, _ := resolveSecretsEncryptionKeyHex("cluster-secret-B", "")

	if collided := keyA == keyB; collided {
		t.Errorf("distinct cluster secrets must derive distinct keys; both = %q", keyA)
	}
}
// The secrets key and the TURN key come from the same cluster secret but
// different HKDF info labels, so they must differ (domain separation).
func TestResolveSecretsEncryptionKeyHex_purposeSeparatedFromTURN(t *testing.T) {
	const cs = "cluster-secret-abc123"

	secretsHex, _ := resolveSecretsEncryptionKeyHex(cs, "")

	turnKey, err := secrets.DeriveKey(cs, "turn-encryption")
	if err != nil {
		t.Fatalf("derive turn key: %v", err)
	}
	turnHex := hex.EncodeToString(turnKey)

	if secretsHex == turnHex {
		t.Error("secrets key collides with the TURN key — HKDF purpose label not providing domain separation")
	}
}
// With no cluster secret (legacy / test rigs) the explicitly configured file
// key is returned, whitespace-trimmed.
func TestResolveSecretsEncryptionKeyHex_emptyClusterSecretUsesFileKey(t *testing.T) {
	const fileKey = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"

	resolved, err := resolveSecretsEncryptionKeyHex("", fileKey+"\n")
	switch {
	case err != nil:
		t.Fatalf("resolve: %v", err)
	case resolved != fileKey:
		t.Errorf("empty cluster secret should return the trimmed file key; got %q", resolved)
	}
}
// Neither a cluster secret nor a file key: the result is empty with no error,
// which lets the production secrets manager fail loudly (allowEphemeral=false)
// instead of silently running on an ephemeral key.
func TestResolveSecretsEncryptionKeyHex_emptyBothReturnsEmpty(t *testing.T) {
	resolved, err := resolveSecretsEncryptionKeyHex("", "")
	switch {
	case err != nil:
		t.Fatalf("resolve: %v", err)
	case resolved != "":
		t.Errorf("want empty result when neither source has a key; got %q", resolved)
	}
}

View File

@ -86,11 +86,6 @@ type ClusterManager struct {
// Track provisioning operations
provisioningMu sync.RWMutex
provisioning map[string]bool // namespace -> in progress
// Leadership-locality reconciler cooldown (bugboard #708): per-namespace
// timestamp of the last leadership transfer, to bound churn. Lazy-init.
leaderLocalityMu sync.Mutex
leaderLocalityCooldown map[string]time.Time
}
// NewClusterManager creates a new cluster manager
@ -1790,24 +1785,8 @@ func (cm *ClusterManager) saveLocalState(state *ClusterLocalState) error {
return fmt.Errorf("failed to marshal state: %w", err)
}
path := filepath.Join(dir, "cluster-state.json")
// Atomic write: this file now carries the namespace TURN shared secret
// (bugboard #130) and is rewritten from multiple converge paths. Write a
// temp file then rename over the target so a reader (or a concurrent
// writer) never observes a half-written secret — rename is atomic on the
// same filesystem. 0600 + chmod on the temp file keeps the secret out of
// world/group read; the rename then makes the live file 0600 too, which
// also tightens a file an older release left at 0644.
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0600); err != nil {
return fmt.Errorf("failed to write temp state file: %w", err)
}
if err := os.Chmod(tmp, 0600); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to set temp state file permissions: %w", err)
}
if err := os.Rename(tmp, path); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to rename state file into place: %w", err)
if err := os.WriteFile(path, data, 0644); err != nil {
return fmt.Errorf("failed to write state file: %w", err)
}
cm.logger.Info("Saved cluster local state", zap.String("namespace", state.NamespaceName), zap.String("path", path))
return nil
@ -1859,78 +1838,12 @@ func (cm *ClusterManager) RestoreLocalClustersFromDisk(ctx context.Context) (int
// restoreWebRTC is the resolved WebRTC gateway config for a restored
// namespace gateway.
const (
	// webrtcResolveRetryDelay is the pause between two consecutive attempts
	// to resolve the WebRTC config during a converge.
	webrtcResolveRetryDelay = 2 * time.Second
	// webrtcResolveRetries caps how many times the converge tries to read the
	// WebRTC secret before giving up. A distant node (high WG RTT) or a
	// just-restarted one can need a few seconds before its namespace rqlite
	// is readable; with a single read it would see nothing and come up with
	// TURN disabled (bugboard #130). The retry loop only pauses BETWEEN
	// attempts, so 5 attempts mean at most 4 pauses — about 8s of waiting on
	// the cold path, plus the time the reads themselves take.
	webrtcResolveRetries = 5
)
// resolveWebRTCConfigWithRetry calls fetch until it succeeds or the attempt
// budget is spent, and returns the first result whose error is nil. A
// distant/just-restarted node's namespace rqlite can take a few seconds to
// become readable; without the retry the read fails once and the gateway comes
// up with TURN disabled (bugboard #130). A genuine decrypt failure (stale
// cluster-secret) also errors and exhausts the retries, returning the final
// error so the caller can mark the result unresolved.
//
// Parameters:
//   - retries: maximum number of fetch attempts. Values below 1 are treated as
//     1, so fetch is always called at least once. Without this, a zero or
//     negative budget returned (nil, nil) with no lookup at all, which a
//     caller reads as "resolved: WebRTC not enabled".
//   - delay: pause passed to sleep between two attempts. There is no pause
//     after the final attempt.
//   - sleep: injected so unit tests run the loop without real delay; nil falls
//     back to time.Sleep.
//   - fetch: the lookup to retry.
//
// On exhaustion the config and error of the LAST attempt are returned.
func resolveWebRTCConfigWithRetry(retries int, delay time.Duration, sleep func(time.Duration), fetch func() (*WebRTCConfig, error)) (*WebRTCConfig, error) {
	if retries < 1 {
		retries = 1
	}
	if sleep == nil {
		sleep = time.Sleep
	}

	var cfg *WebRTCConfig
	var err error
	for attempt := 1; attempt <= retries; attempt++ {
		cfg, err = fetch()
		if err == nil {
			return cfg, nil
		}
		// Only wait when another attempt will actually follow.
		if attempt < retries {
			sleep(delay)
		}
	}
	return cfg, err
}
// applyResolvedWebRTCToState mirrors a freshly resolved WebRTC config into the
// local cluster state, so a later cold start can take the TURN secret from
// disk rather than from the (possibly slow) namespace rqlite (bugboard #130).
//
// HasTURN is derived from a non-empty TURN secret and HasSFU from a positive
// SFU port; the secret, both domains and the port are copied verbatim. The
// return value is true only when at least one of those six fields actually
// changed, which lets the caller skip rewriting the on-disk file when nothing
// is new. The function touches nothing but *state, so it can be unit-tested
// without a live cluster.
func applyResolvedWebRTCToState(state *ClusterLocalState, wr restoreWebRTC) bool {
	hasTURN, hasSFU := wr.turnSecret != "", wr.sfuPort > 0

	unchanged := state.HasTURN == hasTURN &&
		state.HasSFU == hasSFU &&
		state.TURNSharedSecret == wr.turnSecret &&
		state.TURNDomain == wr.turnDomain &&
		state.TURNStealthDomain == wr.stealthDomain &&
		state.SFUSignalingPort == wr.sfuPort
	if unchanged {
		return false
	}

	state.TURNSharedSecret = wr.turnSecret
	state.TURNDomain = wr.turnDomain
	state.TURNStealthDomain = wr.stealthDomain
	state.SFUSignalingPort = wr.sfuPort
	state.HasTURN = hasTURN
	state.HasSFU = hasSFU
	return true
}
// restoreWebRTC is the resolved WebRTC gateway config for a restored
// namespace gateway.
type restoreWebRTC struct {
// enabled tells the caller whether to write a webrtc block; it is always
// false when unresolved is set.
enabled bool
// sfuPort is the SFU signaling port; 0 means no SFU port was resolved.
sfuPort int
// turnDomain and turnSecret are the TURN domain and shared secret; an
// empty turnSecret means no TURN secret was resolved.
turnDomain string
turnSecret string
stealthDomain string // feat-124: empty when webrtc stealth is disabled
// unresolved is true when the DB lookup ERRORED (vs. resolved-but-not-
// enabled) AND the local cache had no secret to fall back to. The caller
// must NOT write a WebRTC-disabled gateway config off an unresolved
// lookup — that silently kills turn.credentials on a node that should
// serve TURN (bugboard #130: a decrypt failure after cluster-secret
// rotation was swallowed into "disabled"). enabled is always false when
// unresolved.
unresolved bool
}
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config. TWO
@ -1943,18 +1856,9 @@ type restoreWebRTC struct {
// - SFU (sfuPort) is PER-NODE — non-zero only when this node runs a
// local SFU (for /v1/webrtc/signal + /rooms proxying).
//
// Precedence: DB-FIRST. The namespace_webrtc_config row is the source of
// truth for the CURRENT TURN secret, so we always consult it. The local
// cluster-state.json cache (dbFetch's counterpart) is a FALLBACK ONLY —
// used when the DB read fails (a slow/just-restarted node whose namespace
// rqlite has not synced yet). This is the bugboard #130 FOLLOW-UP fix: the
// earlier state-FIRST read short-circuited the DB whenever the cache held a
// secret and so NEVER re-validated a present-but-stale cached secret. If a
// secret was rotated (disable→enable) while a node was offline, that node
// kept serving the OLD secret indefinitely. DB-first means a stale cache
// can survive at most until the DB becomes readable on the next converge —
// never indefinitely — while still letting a genuinely DB-down node come up
// on TURN via the cache (the #130 resilience the cache was added for).
// Precedence: prefer the local state file; fall back to the DB (source of
// truth) when the state file lacks the TURN secret (the namespace-wide
// "webrtc is enabled" marker). dbFetch is lazy — only hit when needed.
//
// `enabled` is true when EITHER a TURN secret OR an SFU port is present,
// so the caller knows to write a webrtc block. A non-SFU gateway gets
@ -1965,47 +1869,43 @@ type restoreWebRTC struct {
// standing up the full restore path (systemd spawner + DB + port store).
func chooseRestoreWebRTC(
stateHasSFU bool, stateSFUPort int, stateTURNDomain, stateTURNSecret, stateStealthDomain string,
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool),
dbFetch func() (turnSecret, turnDomain, stealthDomain string, sfuPort int),
) restoreWebRTC {
// DB-first: consult the source of truth before trusting the local cache.
dbSecret, dbDomain, dbStealth, dbSFU, resolved := dbFetch()
if resolved {
// The DB read landed and is authoritative. dbSecret == "" means the
// namespace genuinely has no WebRTC enabled — honor that (disable),
// do NOT fall back to a possibly-stale cached secret. A present
// secret is the CURRENT one and wins over any cached value.
if dbSecret == "" {
return restoreWebRTC{}
}
return restoreWebRTC{
enabled: true,
sfuPort: dbSFU,
turnDomain: dbDomain,
turnSecret: dbSecret,
stealthDomain: dbStealth,
}
}
// The DB/decrypt lookup ERRORED (slow node whose namespace rqlite is not
// readable yet, or a decrypt failure after a cluster-secret rotation).
// Fall back to the locally-cached secret so TURN still comes up — possibly
// stale, but functional, and self-correcting on the next converge once the
// DB is readable (NOT indefinite). If the cache is empty too, signal
// unresolved so the caller preserves the running gateway config instead of
// blanking TURN (bugboard #130).
turnSecret := stateTURNSecret
turnDomain := stateTURNDomain
stealthDomain := stateStealthDomain
sfuPort := 0
if stateHasSFU && stateSFUPort > 0 {
sfuPort = stateSFUPort
}
if stateTURNSecret == "" && sfuPort == 0 {
return restoreWebRTC{unresolved: true}
// Fall back to the DB when the state file has no TURN secret — that's
// the marker that the namespace has WebRTC enabled at all. The state
// file is not updated by EnableWebRTC, so a namespace enabled after
// the state file was written reaches here with an empty secret.
// (Stealth toggles DO rewrite cluster state on every node, so the
// state-first read stays fresh for stealthDomain too.)
if turnSecret == "" {
if dbSecret, dbDomain, dbStealth, dbSFU := dbFetch(); dbSecret != "" {
turnSecret = dbSecret
if turnDomain == "" {
turnDomain = dbDomain
}
if stealthDomain == "" {
stealthDomain = dbStealth
}
if sfuPort == 0 {
sfuPort = dbSFU
}
}
}
return restoreWebRTC{
enabled: stateTURNSecret != "" || sfuPort > 0,
enabled: turnSecret != "" || sfuPort > 0,
sfuPort: sfuPort,
turnDomain: stateTURNDomain,
turnSecret: stateTURNSecret,
stealthDomain: stateStealthDomain,
turnDomain: turnDomain,
turnSecret: turnSecret,
stealthDomain: stealthDomain,
}
}
@ -2154,44 +2054,18 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
SecretsEncryptionKey: cm.secretsEncryptionKey,
}
// Resolve WebRTC config. DB-FIRST (source of truth for the CURRENT
// secret); the local state cache is consulted only when the DB read
// fails (bugboard #130 follow-up — see chooseRestoreWebRTC). Bugboard
// #25 — the state file is NOT updated by EnableWebRTC, so a namespace
// enabled AFTER its state file was written carries no SFU/TURN fields
// here; reading the DB re-materializes them.
// Resolve WebRTC config. Prefer the local state file; fall back to
// the DB (source of truth) to self-heal stale state. Bugboard #25 —
// the state file is NOT updated by EnableWebRTC, so a namespace
// enabled AFTER its state file was written carries no SFU/TURN
// fields here. The lazy dbFetch only hits the DB when the state
// file is incomplete.
wr := chooseRestoreWebRTC(
state.HasSFU, state.SFUSignalingPort, state.TURNDomain, state.TURNSharedSecret, state.TURNStealthDomain,
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool) {
// Retry the read on a transient error. A distant/slow node's
// namespace rqlite may not be synced/readable yet at cold-start
// converge time — without the retry the read fails once and the
// gateway is written with TURN disabled (bugboard #130). The
// secret IS in the DB; we just need the read to land once the
// follower catches up (typically a few seconds). A genuine
// decrypt failure (stale key) also errors here and will exhaust
// the retries → unresolved → the caller preserves the running
// config rather than blanking it.
webrtcCfg, err := resolveWebRTCConfigWithRetry(
webrtcResolveRetries, webrtcResolveRetryDelay, time.Sleep,
func() (*WebRTCConfig, error) {
return cm.GetWebRTCConfig(ctx, state.NamespaceName)
})
if err != nil {
// Persistent error after retries (slow read that never
// landed, or a decrypt failure). Do NOT swallow into
// "disabled" — surface loudly and signal unresolved so the
// caller preserves the running config (bugboard #130).
cm.logger.Error("WebRTC TURN secret unresolvable on this node after retries — refusing to silently disable TURN; preserving existing gateway config. If this is a cluster-secret rotation, regenerate with `orama namespace disable webrtc` then `orama namespace enable webrtc`.",
zap.String("namespace", state.NamespaceName),
zap.String("node_id", cm.localNodeID),
zap.Int("attempts", webrtcResolveRetries),
zap.Error(err))
return "", "", "", 0, false
}
if webrtcCfg == nil {
// Resolved cleanly: the namespace genuinely has no WebRTC.
return "", "", "", 0, true
func() (turnSecret, turnDomain, stealthDomain string, sfuPort int) {
webrtcCfg, err := cm.GetWebRTCConfig(ctx, state.NamespaceName)
if err != nil || webrtcCfg == nil {
return "", "", "", 0
}
// TURN is namespace-wide; SFU port is per-node and may be
// absent on a gateway-only (non-SFU) node — that's fine,
@ -2203,7 +2077,7 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
return webrtcCfg.TURNSharedSecret,
fmt.Sprintf("turn.ns-%s.%s", state.NamespaceName, cm.baseDomain),
cm.stealthDomainFor(state.NamespaceName, webrtcCfg),
sfu, true
sfu
},
)
if wr.enabled {
@ -2215,90 +2089,25 @@ func (cm *ClusterManager) restoreClusterFromState(ctx context.Context, state *Cl
gwCfg.TURNDomain = wr.turnDomain
gwCfg.TURNSecret = wr.turnSecret
gwCfg.TURNStealthDomain = wr.stealthDomain
// Cache the resolved secret into THIS node's local state so that if
// the NEXT cold start can't read the namespace rqlite (a distant/
// slow node whose follower hasn't synced), chooseRestoreWebRTC can
// fall back to this on-disk secret instead of coming up with TURN
// disabled (bugboard #130). The cache is a FALLBACK — DB-first
// resolution still prefers the live DB secret whenever it's
// readable, so this cached value can never pin the node to a stale
// secret. Each node self-heals its own cache on a successful
// resolve; nothing is sent cross-node.
if applyResolvedWebRTCToState(state, wr) {
if err := cm.saveLocalState(state); err != nil {
cm.logger.Warn("Failed to cache resolved WebRTC config to local state (cold start may fall back to the DB read next boot)",
zap.String("namespace", state.NamespaceName), zap.Error(err))
} else {
cm.logger.Info("Cached resolved WebRTC config to local state for cold-start resilience (bugboard #130)",
zap.String("namespace", state.NamespaceName))
}
}
} else if !wr.unresolved {
// The DB read RESOLVED that this namespace has NO WebRTC (disabled).
// Clear any stale cached secret from local state so a future cold
// start that hits a transient DB error can't fall back to it and
// resurrect TURN for a disabled namespace — the hole being: a node
// that was offline during DisableWebRTC never received the cleared
// state push and would otherwise keep serving the old secret. Only
// do this on a RESOLVED-disabled read, NEVER on an unresolved
// (DB-error) one — there the cache IS the fallback and must survive.
if applyResolvedWebRTCToState(state, restoreWebRTC{}) {
if err := cm.saveLocalState(state); err != nil {
cm.logger.Warn("Failed to clear stale cached WebRTC secret from local state after DB reported the namespace disabled",
zap.String("namespace", state.NamespaceName), zap.Error(err))
} else {
cm.logger.Info("Cleared stale cached WebRTC secret from local state (namespace disabled in DB)",
zap.String("namespace", state.NamespaceName))
}
}
}
resp, err := http.Get(fmt.Sprintf("http://localhost:%d/v1/health", pb.GatewayHTTPPort))
if err == nil {
resp.Body.Close()
switch {
case wr.unresolved:
// Bugboard #130 guard: the WebRTC secret could not be resolved
// (DB/decrypt error, logged above). The gateway is already up
// and may be serving TURN from a valid on-disk secret — do NOT
// reconcile it to the empty/disabled block we'd otherwise
// build, which would kill turn.credentials on this node. Leave
// the running config untouched; the operator regenerates the
// secret.
//
// Note: this also defers ReconcileGateway's #837
// secrets-encryption-key reconcile for this one converge pass.
// That is acceptable — the operator action that fixes the
// unresolved TURN secret (regenerate + restart) re-runs the
// full reconcile, and pre-fix this path would have corrupted
// the WebRTC block anyway.
cm.logger.Error("Gateway up but WebRTC secret unresolved — skipping reconcile to avoid disabling TURN on the running config (bugboard #130)",
zap.String("namespace", state.NamespaceName))
default:
// Gateway is already up. Reconcile config drift (bugboard #25 —
// the WARM case): if the running gateway's on-disk config has a
// WebRTC block that differs from the desired (e.g. it lost the
// block on a prior restart where it stayed healthy and the
// cold-spawn path below never ran), rewrite the config +
// restart. ReconcileGateway is a no-op when the on-disk block
// already matches, so this does NOT cause a restart loop.
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
}
// Gateway is already up. Reconcile config drift (bugboard #25 —
// the WARM case): if the running gateway's on-disk config has a
// WebRTC block that differs from the desired (e.g. it lost the
// block on a prior restart where it stayed healthy and the
// cold-spawn path below never ran), rewrite the config + restart.
// ReconcileGateway is a no-op when the on-disk block already
// matches, so this does NOT cause a restart loop on every boot.
if rerr := cm.systemdSpawner.ReconcileGateway(ctx, state.NamespaceName, cm.localNodeID, gwCfg); rerr != nil {
cm.logger.Warn("Gateway WebRTC reconcile failed (leaving running config as-is)",
zap.String("namespace", state.NamespaceName), zap.Error(rerr))
}
} else {
// Gateway is down → cold spawn. We must bring a gateway up
// regardless (the namespace needs one); but if the WebRTC secret
// was unresolved we can't write a working TURN block, so warn
// loudly that TURN is degraded on this node until the secret is
// regenerated (bugboard #130).
switch {
case wr.unresolved:
cm.logger.Error("Cold-spawning gateway with TURN UNAVAILABLE — WebRTC secret unresolved on this node; turn.credentials will return namespace_not_configured until it is regenerated (`orama namespace disable webrtc` then `orama namespace enable webrtc`)",
zap.String("namespace", state.NamespaceName))
case wr.enabled && !state.HasSFU:
// Gateway is down → cold spawn with the resolved config.
if wr.enabled && !state.HasSFU {
cm.logger.Info("Re-materialized WebRTC gateway config from DB (state file was stale)",
zap.String("namespace", state.NamespaceName),
zap.Int("sfu_port", wr.sfuPort))

View File

@ -1,71 +0,0 @@
package namespace
import (
"os"
"path/filepath"
"testing"
"go.uber.org/zap"
)
// Bugboard #130 — cluster-state.json holds the namespace TURN shared secret
// (a plaintext HMAC key). Every writer of that file must therefore create it
// with mode 0600 and must tighten a pre-existing world-readable copy when it
// rewrites it. SaveClusterState is the RECEIVER-side writer: it persists state
// pushed from the coordinator onto a remote namespace node, and before this
// fix the file landed 0644.
//
// TestSaveClusterState_writes0600 covers the fresh-write case: the new file is
// 0600 and the atomic-write temp file is gone afterwards.
func TestSaveClusterState_writes0600(t *testing.T) {
	root := t.TempDir()
	spawner := &SystemdSpawner{namespaceBase: root, logger: zap.NewNop()}

	payload := []byte(`{"turn_shared_secret":"sek-123"}`)
	if err := spawner.SaveClusterState("ns-test", payload); err != nil {
		t.Fatalf("SaveClusterState: %v", err)
	}

	statePath := filepath.Join(root, "ns-test", "cluster-state.json")
	fi, err := os.Stat(statePath)
	if err != nil {
		t.Fatalf("stat cluster-state.json: %v", err)
	}
	if mode := fi.Mode().Perm(); mode != 0600 {
		t.Errorf("cluster-state.json mode = %o; want 0600 (it carries the TURN secret)", mode)
	}

	// The atomic write must not leave its temp file behind.
	_, tmpErr := os.Stat(statePath + ".tmp")
	if !os.IsNotExist(tmpErr) {
		t.Errorf("temp file should not survive a successful save; stat err = %v", tmpErr)
	}
}
// TestSaveClusterState_tightensExisting0644 covers the rewrite case: a
// cluster-state.json that already exists with mode 0644 must end up 0600 after
// SaveClusterState, and its content must be the newly saved payload.
func TestSaveClusterState_tightensExisting0644(t *testing.T) {
	root := t.TempDir()
	spawner := &SystemdSpawner{namespaceBase: root, logger: zap.NewNop()}

	// Seed the world-readable file an older release would have written.
	nsDir := filepath.Join(root, "ns-test")
	if err := os.MkdirAll(nsDir, 0755); err != nil {
		t.Fatal(err)
	}
	statePath := filepath.Join(nsDir, "cluster-state.json")
	if err := os.WriteFile(statePath, []byte(`{"old":true}`), 0644); err != nil {
		t.Fatal(err)
	}

	const fresh = `{"turn_shared_secret":"sek-new"}`
	if err := spawner.SaveClusterState("ns-test", []byte(fresh)); err != nil {
		t.Fatalf("SaveClusterState: %v", err)
	}

	fi, err := os.Stat(statePath)
	if err != nil {
		t.Fatalf("stat cluster-state.json: %v", err)
	}
	if mode := fi.Mode().Perm(); mode != 0600 {
		t.Errorf("rewrite did not tighten perms: mode = %o; want 0600", mode)
	}

	got, err := os.ReadFile(statePath)
	if err != nil {
		t.Fatal(err)
	}
	if string(got) != fresh {
		t.Errorf("content not replaced atomically: %s", got)
	}
}

View File

@ -1,213 +0,0 @@
package namespace
import (
"context"
"net"
"path/filepath"
"time"
"github.com/DeBrosOfficial/network/pkg/rqlite"
"go.uber.org/zap"
)
// Bugboard #708 — namespace raft leadership is geography-blind: the initial
// leader is sortedNodeIDs[0] over random libp2p peer IDs, and raft re-elects
// freely on every restart. When a geographically-distant node (high WireGuard
// RTT to its peers) becomes the leader, EVERY namespace write funnels through
// the distant node and waits on its cross-region replication for quorum — each
// rqlite hop jumps from ~20ms (co-located) to ~256ms, stacking into 5-10s RPCs
// that break calling.
//
// This reconciler keeps namespace leadership on a co-located voter. It NEVER
// removes a node or changes voter membership — all nodes stay voters (quorum
// and fault tolerance unchanged). It only hands leadership OFF a node that is
// isolated from the rest of the cluster, using rqlite's own
// transfer-leadership API.
// Tunables for the leader-locality reconciler.
const (
	// leaderLocalityInterval is how often each node checks whether the
	// namespace clusters it leads are well-placed. It is the period of the
	// ticker that drives the reconciler loop.
	leaderLocalityInterval = 90 * time.Second
	// leaderLocalityRTTThreshold: if the leader's CLOSEST voter peer is farther
	// than this, the leader is treated as geographically isolated and hands off
	// leadership. Co-located nodes are ~20ms apart; a distant node is ~256ms —
	// 100ms cleanly separates the two without false positives. The comparison
	// is strict (RTT > threshold), so a peer exactly at the threshold does not
	// trigger a transfer.
	leaderLocalityRTTThreshold = 100 * time.Millisecond
	// leaderLocalityCooldown bounds how often a single namespace's leadership
	// is moved. In the common topology (a lone distant node among co-located
	// peers) ONE transfer settles leadership on a co-located voter, which then
	// stays (it has a nearby peer, so it never re-triggers). In a pathological
	// all-mutually-distant topology there is no good leader to move to and the
	// nearest-peer transfer would rotate; the cooldown caps that to roughly one
	// transfer per node per window (bounded, non-destructive — membership and
	// quorum are never touched), and node selection clustering most nodes
	// ~20ms apart makes that case rare.
	leaderLocalityCooldown = 10 * time.Minute
	// leaderLocalityDialTimeout bounds each per-peer RTT probe (the TCP dial
	// used to measure a peer's round-trip time).
	leaderLocalityDialTimeout = 3 * time.Second
)
// decideLeadershipTransfer is the pure decision: should the local leader hand
// off leadership, and to which voter?
//
// peerRTTs maps each OTHER reachable voter (keyed by whatever identifier the
// caller will hand to the transfer call) to its measured RTT.
//
// It returns a target and true ONLY when all of the following hold:
//   - this node is the leader;
//   - every voter is reachable (don't destabilize an already-degraded cluster);
//   - the cooldown has elapsed;
//   - at least one peer was measured; and
//   - even the CLOSEST peer is strictly farther than threshold, i.e. the
//     leader is isolated.
//
// If the leader has at least one voter at or below the threshold it is central
// enough and is left alone ("", false). The chosen target is the nearest
// reachable peer (which, in a 1-distant/N-close topology, is a co-located node
// that will then have a nearby peer of its own → stable).
//
// Selection is deterministic: map iteration order is random in Go, so peers
// with an identical RTT are tie-broken by the lexicographically smallest key.
// An explicit "found" flag (rather than an empty-string sentinel) tracks
// whether a candidate has been picked, so a peer keyed by "" cannot be
// silently replaced by a slower one.
func decideLeadershipTransfer(isLeader, allVotersReachable, cooldownElapsed bool, peerRTTs map[string]time.Duration, threshold time.Duration) (string, bool) {
	if !isLeader || !allVotersReachable || !cooldownElapsed || len(peerRTTs) == 0 {
		return "", false
	}

	var (
		bestAddr string
		bestRTT  time.Duration
		found    bool
	)
	for addr, rtt := range peerRTTs {
		closer := rtt < bestRTT
		tieWins := rtt == bestRTT && addr < bestAddr
		if !found || closer || tieWins {
			bestAddr, bestRTT, found = addr, rtt, true
		}
	}

	// A nearest peer at or under the threshold means the leader is central.
	if bestRTT <= threshold {
		return "", false
	}
	return bestAddr, true
}
// measurePeerRTTs probes every OTHER voter and returns the measured RTTs keyed
// by node ID, together with a flag that is true only when ALL of those voters
// were both marked reachable and successfully probed. The caller uses the flag
// to refuse to act on a degraded cluster.
//
// Non-voters and the node whose ID equals selfID are skipped entirely. A voter
// with an empty Address is dialed by its ID instead.
func measurePeerRTTs(nodes rqlite.RQLiteNodes, selfID string) (map[string]time.Duration, bool) {
	rtts := map[string]time.Duration{}
	degraded := false

	for _, node := range nodes {
		switch {
		case !node.Voter, node.ID == selfID:
			// Not a peer voter: irrelevant to the locality decision.
			continue
		case !node.Reachable:
			degraded = true
			continue
		}

		target := node.Address
		if target == "" {
			target = node.ID
		}

		elapsed, err := measureRaftRTT(target, leaderLocalityDialTimeout)
		if err != nil {
			// A probe failure counts the same as an unreachable voter.
			degraded = true
			continue
		}
		rtts[node.ID] = elapsed
	}

	return rtts, !degraded
}
// measureRaftRTT returns the TCP-connect time to a peer's raft address — a
// privilege-free proxy for WireGuard round-trip latency.
//
// raftAddr is a "host:port" address dialed over TCP; timeout bounds the dial.
// On a dial failure the returned duration is 0 and the dial error is returned
// unchanged.
//
// The elapsed time is captured as soon as the dial returns, BEFORE the
// connection is closed, so connection teardown is never counted as part of the
// measured round trip.
func measureRaftRTT(raftAddr string, timeout time.Duration) (time.Duration, error) {
	start := time.Now()
	conn, err := net.DialTimeout("tcp", raftAddr, timeout)
	elapsed := time.Since(start)
	if err != nil {
		return 0, err
	}
	// Best-effort close: the probe has no use for the connection and a close
	// error carries no information about latency.
	_ = conn.Close()
	return elapsed, nil
}
// leaderTransferCooldownElapsed reports whether a leadership transfer may be
// attempted for namespace: true when no transfer has been recorded for it, or
// when at least leaderLocalityCooldown has passed since the recorded one.
// The lookup is done under leaderLocalityMu.
func (cm *ClusterManager) leaderTransferCooldownElapsed(namespace string) bool {
	cm.leaderLocalityMu.Lock()
	defer cm.leaderLocalityMu.Unlock()

	if movedAt, seen := cm.leaderLocalityCooldown[namespace]; seen {
		return time.Since(movedAt) >= leaderLocalityCooldown
	}
	return true
}
// recordLeaderTransfer stamps namespace with the current time, starting its
// transfer cooldown. The per-namespace map is created lazily on first use, so
// a zero-value ClusterManager works. The update is done under
// leaderLocalityMu.
func (cm *ClusterManager) recordLeaderTransfer(namespace string) {
	cm.leaderLocalityMu.Lock()
	defer cm.leaderLocalityMu.Unlock()

	stamps := cm.leaderLocalityCooldown
	if stamps == nil {
		stamps = make(map[string]time.Time)
		cm.leaderLocalityCooldown = stamps
	}
	stamps[namespace] = time.Now()
}
// StartLeaderLocalityReconciler launches a background goroutine that runs the
// leadership-locality check once per leaderLocalityInterval until ctx is
// cancelled. It returns immediately; the first check happens after the first
// tick, not at call time. Safe to call once at node boot.
func (cm *ClusterManager) StartLeaderLocalityReconciler(ctx context.Context) {
	loop := func() {
		tick := time.NewTicker(leaderLocalityInterval)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				cm.reconcileLeaderLocality(ctx)
			case <-ctx.Done():
				return
			}
		}
	}
	go loop()
}
// reconcileLeaderLocality walks every namespace cluster this node hosts —
// discovered as <baseDataDir>/*/cluster-state.json — and runs the per-namespace
// leadership check on each. State files that fail to load are skipped, and the
// walk stops early once ctx is done.
func (cm *ClusterManager) reconcileLeaderLocality(ctx context.Context) {
	stateFiles, err := filepath.Glob(filepath.Join(cm.baseDataDir, "*", "cluster-state.json"))
	if err != nil {
		cm.logger.Debug("leader-locality: glob failed", zap.Error(err))
		return
	}

	for i := 0; i < len(stateFiles) && ctx.Err() == nil; i++ {
		st, loadErr := loadLocalState(stateFiles[i])
		if loadErr != nil {
			continue
		}
		cm.reconcileNamespaceLeader(st.NamespaceName, st.LocalPorts.RQLiteHTTPPort)
	}
}
// reconcileNamespaceLeader handles a single namespace's leadership locality.
//
// It does nothing when the namespace has no rqlite HTTP port, when the local
// rqlite status or node list cannot be fetched, or when the local raft state
// is not "Leader" (only the leader can transfer leadership away). Otherwise it
// measures the RTT to every other voter and, if decideLeadershipTransfer says
// the leader is isolated, asks rqlite to move leadership to the chosen target.
func (cm *ClusterManager) reconcileNamespaceLeader(namespace string, rqliteHTTPPort int) {
	if rqliteHTTPPort == 0 {
		return
	}

	// A status error means rqlite is not up / not reachable on this node;
	// a non-Leader state means there is nothing this node can hand off.
	raftStatus, err := rqlite.GetRaftStatus(rqliteHTTPPort)
	if err != nil || raftStatus.Store.Raft.State != "Leader" {
		return
	}

	// This node is the leader, so the reported leader ID is its own ID.
	localID := raftStatus.Store.Raft.LeaderID
	members, err := rqlite.GetRaftNodes(rqliteHTTPPort)
	if err != nil {
		return
	}

	rtts, healthy := measurePeerRTTs(members, localID)
	cooledDown := cm.leaderTransferCooldownElapsed(namespace)
	target, shouldMove := decideLeadershipTransfer(true, healthy, cooledDown, rtts, leaderLocalityRTTThreshold)
	if !shouldMove {
		return
	}

	cm.logger.Info("leader-locality: this node is an isolated namespace raft leader — transferring leadership to a co-located voter (bugboard #708)",
		zap.String("namespace", namespace),
		zap.String("from", localID),
		zap.String("to", target),
		zap.Duration("target_rtt", rtts[target]),
	)

	// Start the cooldown BEFORE attempting the transfer: whatever the outcome,
	// a slow or looping transfer must not be able to re-fire on the next tick.
	cm.recordLeaderTransfer(namespace)
	if terr := rqlite.TransferLeadershipTo(rqliteHTTPPort, target, cm.logger); terr != nil {
		cm.logger.Warn("leader-locality: leadership transfer failed",
			zap.String("namespace", namespace), zap.Error(terr))
	}
}

View File

@ -1,93 +0,0 @@
package namespace
import (
"testing"
"time"
)
// Bugboard #708 — the leadership-locality reconciler hands leadership off a
// geographically-isolated namespace raft leader to the nearest co-located
// voter, without changing membership. These pin the decision logic.
// thr is the RTT threshold passed to decideLeadershipTransfer by the decision
// tests in this file; 100ms is the same value as leaderLocalityRTTThreshold.
const thr = 100 * time.Millisecond
// TestDecideLeadershipTransfer_isolatedLeaderTransfersToNearest pins the core
// case: a distant leader (109) whose peers are BOTH beyond the threshold must
// hand leadership to the nearest of them (57 @235ms).
func TestDecideLeadershipTransfer_isolatedLeaderTransfersToNearest(t *testing.T) {
	const nearest = "10.0.0.1:10001"
	rtts := map[string]time.Duration{
		"10.0.0.6:10001": 256 * time.Millisecond, // 51
		nearest:          235 * time.Millisecond, // 57
	}

	got, moved := decideLeadershipTransfer(true, true, true, rtts, thr)
	if !moved {
		t.Fatal("an isolated leader (closest peer 235ms > 100ms) must transfer")
	}
	if got != nearest {
		t.Errorf("must transfer to the NEAREST peer; got %q", got)
	}
}
// TestDecideLeadershipTransfer_centralLeaderStays pins the steady state: a
// co-located leader (51) with one nearby peer (57 @20ms) and one distant peer
// (109) has a minimum RTT of 20ms < 100ms, so it is central and must NOT
// transfer.
func TestDecideLeadershipTransfer_centralLeaderStays(t *testing.T) {
	rtts := map[string]time.Duration{
		"10.0.0.1:10001":  20 * time.Millisecond,  // 57 (close)
		"10.0.0.11:10001": 256 * time.Millisecond, // 109 (far)
	}

	_, moved := decideLeadershipTransfer(true, true, true, rtts, thr)
	if moved {
		t.Error("a leader with a nearby voter is central enough; must NOT transfer")
	}
}
// TestDecideLeadershipTransfer_allDistantTransfersToNearest covers the
// pathological all-mutually-distant topology: every peer is far, so no truly
// co-located target exists. The decision still picks the NEAREST peer (best
// available); bounding the resulting churn to roughly one transfer per node
// per window is the job of the per-namespace cooldown (see
// TestLeaderTransferCooldown), not of this function.
func TestDecideLeadershipTransfer_allDistantTransfersToNearest(t *testing.T) {
	rtts := map[string]time.Duration{
		"a": 250 * time.Millisecond,
		"b": 210 * time.Millisecond,
	}

	got, moved := decideLeadershipTransfer(true, true, true, rtts, thr)
	if moved && got == "b" {
		return
	}
	t.Errorf("all-distant: expected transfer to nearest 'b'; got transfer=%v target=%q", moved, got)
}
// TestDecideLeadershipTransfer_guards checks each precondition in isolation:
// with an otherwise transfer-worthy input, flipping any single guard (not the
// leader, a voter unreachable, cooldown active, no measured peers) must
// suppress the transfer.
func TestDecideLeadershipTransfer_guards(t *testing.T) {
	far := map[string]time.Duration{"p": 300 * time.Millisecond}

	cases := []struct {
		isLeader, reachable, cooled bool
		peers                       map[string]time.Duration
		failMsg                     string
	}{
		{false, true, true, far, "non-leader must never transfer"},
		{true, false, true, far, "must not transfer when a voter is unreachable (degraded cluster)"},
		{true, true, false, far, "must not transfer during cooldown"},
		{true, true, true, map[string]time.Duration{}, "must not transfer with no measurable peers (single-node / all-unreachable)"},
	}

	for _, tc := range cases {
		_, moved := decideLeadershipTransfer(tc.isLeader, tc.reachable, tc.cooled, tc.peers, thr)
		if moved {
			t.Error(tc.failMsg)
		}
	}
}
// TestDecideLeadershipTransfer_exactlyThresholdStays pins the boundary: the
// comparison is strict, so a closest peer sitting exactly AT the threshold is
// not "farther than" it and the leader stays (no churn at the boundary).
func TestDecideLeadershipTransfer_exactlyThresholdStays(t *testing.T) {
	atBoundary := map[string]time.Duration{"p": thr}

	_, moved := decideLeadershipTransfer(true, true, true, atBoundary, thr)
	if moved {
		t.Error("RTT exactly at the threshold must not trigger a transfer")
	}
}
// TestLeaderTransferCooldown exercises the cooldown bookkeeping on a
// zero-value ClusterManager: a namespace with no recorded transfer is out of
// cooldown, recording a transfer puts that namespace (and only that namespace)
// into cooldown.
func TestLeaderTransferCooldown(t *testing.T) {
	mgr := &ClusterManager{}

	if elapsed := mgr.leaderTransferCooldownElapsed("ns"); !elapsed {
		t.Error("fresh namespace (no prior transfer) must be out of cooldown")
	}

	mgr.recordLeaderTransfer("ns")

	if elapsed := mgr.leaderTransferCooldownElapsed("ns"); elapsed {
		t.Error("immediately after a transfer the namespace must be in cooldown")
	}
	if elapsed := mgr.leaderTransferCooldownElapsed("other-ns"); !elapsed {
		t.Error("cooldown must be per-namespace")
	}
}

View File

@ -1,69 +1,39 @@
package namespace
import (
"errors"
"testing"
"time"
)
import "testing"
// Bugboard #25 — WebRTC config drift on restart + TURN/SFU decouple.
// Bugboard #130 follow-up — DB-FIRST resolution so a stale cached secret can
// never be served indefinitely.
//
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config DB-FIRST
// (the namespace_webrtc_config row is the source of truth for the current
// secret); the local cluster-state.json cache is a FALLBACK consulted only
// when the DB read fails (a slow node whose namespace rqlite hasn't synced).
// It also DECOUPLES the two aspects: TURN (secret + domain) is namespace-wide
// so ANY gateway can serve credentials; the SFU port is per-node (0 on a
// gateway-only node). Pins the drift fallback, the non-SFU-gateway case, and
// the DB-first precedence (DB secret wins over a cached/stale one).
// chooseRestoreWebRTC resolves a restored gateway's WebRTC config from the
// local state file (which EnableWebRTC does NOT update) with a DB fallback
// (source of truth). It also DECOUPLES the two aspects: TURN (secret +
// domain) is namespace-wide so ANY gateway can serve credentials; the SFU
// port is per-node (0 on a gateway-only node). Pins both the drift
// fallback and the non-SFU-gateway case.
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int, resolved bool).
// resolved=true means the lookup completed (with or without a config);
// resolved=false means it ERRORED (e.g. decrypt failure) → unresolved.
func dbNone() (string, string, string, int, bool) { return "", "", "", 0, true }
// dbFetch signature: () -> (turnSecret, turnDomain, stealthDomain string, sfuPort int).
func dbNone() (string, string, string, int) { return "", "", "", 0 }
// dbError models a DB/decrypt failure: the lookup did not complete.
func dbError() (string, string, string, int, bool) { return "", "", "", 0, false }
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int, bool) {
return func() (string, string, string, int, bool) { return secret, domain, "", sfuPort, true }
func dbFull(secret, domain string, sfuPort int) func() (string, string, string, int) {
return func() (string, string, string, int) { return secret, domain, "", sfuPort }
}
func TestChooseRestoreWebRTC_dbSecretWinsOverCachedState(t *testing.T) {
// THE #130 FOLLOW-UP (staleness) case. The state file holds a cached
// secret, but the DB (source of truth) has a DIFFERENT, current secret —
// e.g. the secret was rotated (disable→enable) while this node was offline.
// DB-first MUST serve the current DB secret, NOT the stale cached one. The
// old state-first logic short-circuited the DB here and served "old-secret"
// indefinitely.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "old-secret", "cdn-old.dbrs.space",
dbFull("new-secret", "turn.ns-x.dbrs.space", 7800))
func TestChooseRestoreWebRTC_stateFileCompleteWins(t *testing.T) {
// State file has TURN secret → use it, and NEVER consult the DB
// (the lazy dbFetch must not be called — saves a query on the hot
// restart path).
dbCalled := false
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "",
func() (string, string, string, int) { dbCalled = true; return dbNone() })
if !got.enabled {
t.Fatal("DB has a current secret; result must be enabled")
if dbCalled {
t.Error("DB fetch was called even though the state file had the TURN secret (should short-circuit)")
}
if got.turnSecret != "new-secret" {
t.Errorf("BUG #130 STALENESS: turnSecret = %q; want new-secret (the current DB value, not the stale cache)", got.turnSecret)
if !got.enabled || got.sfuPort != 7800 || got.turnSecret != "state-secret" {
t.Errorf("want state-file values; got %+v", got)
}
if got.sfuPort != 7800 || got.turnDomain != "turn.ns-x.dbrs.space" {
t.Errorf("want DB-derived block; got %+v", got)
}
}
func TestChooseRestoreWebRTC_dbDisabledOverridesCachedSecret(t *testing.T) {
// The cache holds a secret but the DB read completes and reports NO WebRTC
// (the namespace was disabled while this node was offline). DB-first must
// honor the disable, NOT keep serving the stale cached secret.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "stale-secret", "",
dbNone) // dbNone = resolved, no config
if got.enabled {
t.Errorf("DB reports disabled: must not keep serving the cached secret; got %+v", got)
}
if got.unresolved {
t.Error("a clean resolved-but-disabled lookup must not be marked unresolved")
if got.turnDomain != "turn.ns-x.dbrs.space" {
t.Errorf("turnDomain = %q; want state-file value", got.turnDomain)
}
}
@ -109,19 +79,19 @@ func TestChooseRestoreWebRTC_nonSFUGatewayGetsTURNOnly(t *testing.T) {
}
}
func TestChooseRestoreWebRTC_cachedTurnOnlyFallbackOnDBError(t *testing.T) {
// A non-SFU node holds a cached TURN secret (HasSFU false / port 0) and the
// DB read ERRORS (its namespace rqlite isn't readable yet at cold start).
// DB-first falls back to the cached secret so the gateway still serves TURN
// credentials — sfuPort stays 0 (no local SFU). This is the #130 resilience
// the cache exists for.
got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "", dbError)
func TestChooseRestoreWebRTC_stateHasTURNButNoSFU(t *testing.T) {
// State file for a non-SFU node: it has the TURN secret but HasSFU is
// false / port 0. Must use the state TURN secret with sfuPort=0 and
// NOT consult the DB (TURN secret present = complete enough).
dbCalled := false
got := chooseRestoreWebRTC(false, 0, "turn.ns-x.dbrs.space", "state-secret", "",
func() (string, string, string, int) { dbCalled = true; return dbNone() })
if !got.enabled || got.sfuPort != 0 || got.turnSecret != "state-secret" {
t.Errorf("want cached TURN-only fallback (sfuPort 0); got %+v", got)
if dbCalled {
t.Error("DB fetch called even though state file had the TURN secret")
}
if got.unresolved {
t.Error("a usable cached secret must not be marked unresolved")
if !got.enabled || got.sfuPort != 0 || got.turnSecret != "state-secret" {
t.Errorf("want TURN-only from state (sfuPort 0); got %+v", got)
}
}
@ -140,7 +110,7 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) {
// enablement marker; without it we treat it as not-configured-for-
// TURN, but an SFU port alone still enables SFU routes.
got := chooseRestoreWebRTC(false, 0, "", "", "",
func() (string, string, string, int, bool) { return "", "turn.db", "", 9000, true })
func() (string, string, string, int) { return "", "turn.db", "", 9000 })
// dbFetch only runs when state secret is empty; here it returns no
// secret, so the `if dbSecret != ""` guard means NOTHING is taken
// from the DB → disabled. (An SFU-only-no-TURN namespace is not a
@ -152,14 +122,16 @@ func TestChooseRestoreWebRTC_dbNoSecretStaysDisabled(t *testing.T) {
// --- feat-124 stealth domain restore precedence ---
func TestChooseRestoreWebRTC_stealthFromCacheOnDBError(t *testing.T) {
// When the DB read errors, the cache fallback carries the whole block —
// including the cached stealth domain — so a stealth-enabled namespace
// keeps advertising its stealth rung on a cold start that can't reach the
// DB yet.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space", dbError)
if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" {
t.Errorf("stealthDomain = %q; want cached value on DB-error fallback; got %+v", got.stealthDomain, got)
func TestChooseRestoreWebRTC_stealthFromStateFile(t *testing.T) {
// Stealth toggles rewrite cluster state, so a fresh state file carries
// the stealth domain and must win without a DB call.
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "cdn-abc123def456.dbrs.space",
func() (string, string, string, int) {
t.Error("DB fetch called even though state file was complete")
return dbNone()
})
if got.stealthDomain != "cdn-abc123def456.dbrs.space" {
t.Errorf("stealthDomain = %q; want state-file value", got.stealthDomain)
}
}
@ -167,210 +139,19 @@ func TestChooseRestoreWebRTC_stealthFromDBOnStaleState(t *testing.T) {
// Stale state (no TURN secret) + DB has stealth enabled → stealth domain
// re-materializes from the DB alongside the rest of the WebRTC block.
got := chooseRestoreWebRTC(false, 0, "", "", "",
func() (string, string, string, int, bool) {
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801, true
func() (string, string, string, int) {
return "db-secret", "turn.ns-x.dbrs.space", "cdn-abc123def456.dbrs.space", 7801
})
if !got.enabled || got.stealthDomain != "cdn-abc123def456.dbrs.space" {
t.Errorf("want stealth domain from DB on stale state; got %+v", got)
}
}
// --- bugboard #130: distinguish "unresolved (DB/decrypt error)" from "disabled" ---
// TestChooseRestoreWebRTC_dbErrorMarksUnresolvedNotDisabled covers the
// bugboard #130 scenario: the state file carries no secret (freshly-joined
// node) and the DB lookup errors (e.g. the stored TURN secret cannot be
// decrypted after a cluster-secret rotation). The result must be flagged
// unresolved rather than cleanly "disabled", so the caller keeps the running
// config instead of writing a TURN-disabled gateway (which made
// turn.credentials return namespace_not_configured).
func TestChooseRestoreWebRTC_dbErrorMarksUnresolvedNotDisabled(t *testing.T) {
	res := chooseRestoreWebRTC(false, 0, "", "", "", dbError)

	if !res.unresolved {
		t.Fatal("BUG #130 REGRESSION: a DB/decrypt error must mark the result unresolved")
	}
	if res.enabled {
		t.Errorf("unresolved result must never be enabled (would write a config off an errored lookup); got %+v", res)
	}
	if secret := res.turnSecret; secret != "" {
		t.Errorf("unresolved result must carry no secret; got %q", secret)
	}
}
// TestChooseRestoreWebRTC_resolvedEmptyIsDisabledNotUnresolved is the contrast
// to the DB-error case: the lookup completes and reports no WebRTC (a
// genuinely disabled namespace). That must come back disabled and NOT
// unresolved, leaving the caller free to write the empty/disabled config.
func TestChooseRestoreWebRTC_resolvedEmptyIsDisabledNotUnresolved(t *testing.T) {
	res := chooseRestoreWebRTC(false, 0, "", "", "", dbNone)

	if res.unresolved {
		t.Error("a clean resolved-but-empty lookup must NOT be marked unresolved")
	}
	if res.enabled {
		t.Errorf("genuinely-disabled namespace must be disabled; got %+v", res)
	}
}
// TestChooseRestoreWebRTC_cachedSecretSurvivesDBError checks that a node
// holding the TURN secret in its state file is not disabled by a flaky or
// unsynced DB: when the DB read errors, the DB-first path falls back to the
// cached secret and the result stays enabled (and is not unresolved). It guards
// against the #130 fix disabling nodes while the DB is briefly unreadable.
func TestChooseRestoreWebRTC_cachedSecretSurvivesDBError(t *testing.T) {
	res := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbError)

	survived := !res.unresolved && res.enabled && res.turnSecret == "state-secret"
	if !survived {
		t.Errorf("cached secret must survive a DB error and stay enabled; got %+v", res)
	}
}
// TestChooseRestoreWebRTC_noStealthStaysEmpty checks that with stealth disabled
// the stealthDomain stays empty (the gateway then advertises only the baseline
// 3-rung ladder). It goes through the cache-fallback path (DB error) so an
// enabled-but-no-stealth config is exercised end to end.
func TestChooseRestoreWebRTC_noStealthStaysEmpty(t *testing.T) {
	res := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbError)

	if ok := res.enabled && res.stealthDomain == ""; !ok {
		t.Errorf("stealthDomain = %q; want empty when stealth is disabled; got %+v", res.stealthDomain, res)
	}
}
// ----------------------------------------------------------------------------
// Bugboard #130 — cache the resolved WebRTC secret into local state so a slow
// node's cold start reads it from disk instead of the (slow) namespace rqlite.
// ----------------------------------------------------------------------------
// TestApplyResolvedWebRTCToState_populatesAndReportsChange applies a resolved
// WebRTC config to an empty local state and expects every cached field to be
// filled in and a change to be reported. It then feeds the cached state back
// through chooseRestoreWebRTC with a failing DB read to confirm the cached
// secret is what the cold-start fallback returns.
func TestApplyResolvedWebRTCToState_populatesAndReportsChange(t *testing.T) {
	// Fresh node: no cached secret (the #130 gap).
	state := &ClusterLocalState{}
	resolved := restoreWebRTC{
		enabled:       true,
		turnSecret:    "sek-123",
		turnDomain:    "turn.ns-x.dbrs.space",
		stealthDomain: "cdn-abc.dbrs.space",
		sfuPort:       30000,
	}

	if changed := applyResolvedWebRTCToState(state, resolved); !changed {
		t.Fatal("expected change=true when caching a secret into empty state")
	}
	if state.TURNSharedSecret != "sek-123" {
		t.Errorf("TURNSharedSecret = %q; want sek-123 (must be cached for cold start)", state.TURNSharedSecret)
	}
	populated := state.HasTURN &&
		state.HasSFU &&
		state.SFUSignalingPort == 30000 &&
		state.TURNDomain == "turn.ns-x.dbrs.space" &&
		state.TURNStealthDomain == "cdn-abc.dbrs.space"
	if !populated {
		t.Errorf("state not fully populated: %+v", state)
	}

	// The point of caching: on a SECOND boot where the DB read fails (slow
	// node, namespace rqlite not synced), the cached secret still lets the
	// gateway come up on TURN because DB-first falls back to the cache.
	coldStart := chooseRestoreWebRTC(
		state.HasSFU,
		state.SFUSignalingPort,
		state.TURNDomain,
		state.TURNSharedSecret,
		state.TURNStealthDomain,
		dbError,
	)
	if ok := coldStart.enabled && !coldStart.unresolved && coldStart.turnSecret == "sek-123"; !ok {
		t.Errorf("cached cold start should fall back to the state secret on a DB error; got %+v", coldStart)
	}
}
// TestApplyResolvedWebRTCToState_noChangeWhenAlreadyCached checks that applying
// a resolved config identical to what the state already holds reports no
// change (so no rewrite is triggered).
func TestApplyResolvedWebRTCToState_noChangeWhenAlreadyCached(t *testing.T) {
	state := &ClusterLocalState{
		HasTURN:           true,
		HasSFU:            true,
		TURNSharedSecret:  "sek-123",
		TURNDomain:        "d",
		TURNStealthDomain: "s",
		SFUSignalingPort:  30000,
	}
	resolved := restoreWebRTC{
		enabled:       true,
		turnSecret:    "sek-123",
		turnDomain:    "d",
		stealthDomain: "s",
		sfuPort:       30000,
	}

	if changed := applyResolvedWebRTCToState(state, resolved); changed {
		t.Error("expected change=false (no rewrite) when state already matches the resolved config")
	}
}
// TestApplyResolvedWebRTCToState_turnOnlyNode_noSFU covers a gateway-only node
// (serves TURN credentials, runs no local SFU): the secret is set and sfuPort
// is 0. The secret must still be cached with HasTURN=true and HasSFU=false.
func TestApplyResolvedWebRTCToState_turnOnlyNode_noSFU(t *testing.T) {
	state := &ClusterLocalState{}
	resolved := restoreWebRTC{enabled: true, turnSecret: "sek", turnDomain: "d", sfuPort: 0}

	if changed := applyResolvedWebRTCToState(state, resolved); !changed {
		t.Fatal("want change=true")
	}
	turnOnly := state.HasTURN && !state.HasSFU && state.TURNSharedSecret == "sek"
	if !turnOnly {
		t.Errorf("turn-only node: want HasTURN=true HasSFU=false secret cached; got %+v", state)
	}
}
// TestApplyResolvedWebRTCToState_clearsCacheOnDisable checks the disable path:
// when the DB resolves the namespace as DISABLED, the caller applies an empty
// restoreWebRTC to wipe any stale cached secret from local state. That way a
// node that was offline during DisableWebRTC cannot later fall back to the old
// secret on a transient DB error and resurrect TURN for a disabled namespace.
// The call must report change=true and zero the cached fields.
func TestApplyResolvedWebRTCToState_clearsCacheOnDisable(t *testing.T) {
	state := &ClusterLocalState{
		HasTURN:          true,
		HasSFU:           true,
		TURNSharedSecret: "stale-secret",
		TURNDomain:       "turn.ns-x.dbrs.space",
		SFUSignalingPort: 7800,
	}

	if changed := applyResolvedWebRTCToState(state, restoreWebRTC{}); !changed {
		t.Fatal("disable: want change=true when clearing a cached secret")
	}
	cleared := state.TURNSharedSecret == "" &&
		!state.HasTURN &&
		!state.HasSFU &&
		state.SFUSignalingPort == 0 &&
		state.TURNDomain == ""
	if !cleared {
		t.Errorf("cache not fully cleared on disable: %+v", state)
	}
}
// TestApplyResolvedWebRTCToState_secretRotationReportsChange covers secret
// rotation: the state holds an OLD cached secret and a fresh resolve brings the
// NEW (rotated) one. applyResolvedWebRTCToState must report change=true and
// overwrite the cache, so the node's fallback secret tracks the rotation
// instead of leaving a stale value on disk (bugboard #130 follow-up).
func TestApplyResolvedWebRTCToState_secretRotationReportsChange(t *testing.T) {
	state := &ClusterLocalState{
		HasTURN:          true,
		TURNSharedSecret: "old-secret",
		TURNDomain:       "turn.ns-x.dbrs.space",
	}
	rotated := restoreWebRTC{
		enabled:    true,
		turnSecret: "new-secret",
		turnDomain: "turn.ns-x.dbrs.space",
	}

	if changed := applyResolvedWebRTCToState(state, rotated); !changed {
		t.Fatal("rotation: want change=true when the resolved secret differs from the cached one")
	}
	if cached := state.TURNSharedSecret; cached != "new-secret" {
		t.Errorf("cache not updated to the rotated secret: got %q; want new-secret", cached)
	}
}
// ----------------------------------------------------------------------------
// Bugboard #130 — the cold-start read retries so a slow node's namespace
// rqlite read lands once the follower syncs, instead of failing once and
// coming up with TURN disabled.
// ----------------------------------------------------------------------------
func TestResolveWebRTCConfigWithRetry_succeedsOnNthAttempt(t *testing.T) {
// The read errors on the first two attempts (rqlite not readable yet) then
// succeeds — the retry must return the config and not surface the earlier
// transient errors.
calls := 0
slept := 0
cfg, err := resolveWebRTCConfigWithRetry(5, time.Millisecond, func(time.Duration) { slept++ },
func() (*WebRTCConfig, error) {
calls++
if calls < 3 {
return nil, errors.New("rqlite not readable yet")
}
return &WebRTCConfig{TURNSharedSecret: "sek-123"}, nil
})
if err != nil {
t.Fatalf("want success on the 3rd attempt; got err %v", err)
}
if cfg == nil || cfg.TURNSharedSecret != "sek-123" {
t.Fatalf("want resolved config; got %+v", cfg)
}
if calls != 3 {
t.Errorf("want exactly 3 fetch attempts; got %d", calls)
}
if slept != 2 {
t.Errorf("want a sleep between each of the 2 failed attempts; got %d", slept)
}
}
func TestResolveWebRTCConfigWithRetry_exhaustsAndReturnsError(t *testing.T) {
// A persistent error (e.g. a decrypt failure after cluster-secret rotation)
// must exhaust all attempts and return the final error — the caller maps
// that to unresolved (NOT disabled). No sleep after the final attempt.
calls := 0
slept := 0
cfg, err := resolveWebRTCConfigWithRetry(4, time.Millisecond, func(time.Duration) { slept++ },
func() (*WebRTCConfig, error) {
calls++
return nil, errors.New("decrypt failed")
})
if err == nil {
t.Fatal("want the final error after exhausting retries; got nil")
}
if cfg != nil {
t.Errorf("want nil config on exhaustion; got %+v", cfg)
}
if calls != 4 {
t.Errorf("want 4 attempts (all retries used); got %d", calls)
}
if slept != 3 {
t.Errorf("want a sleep between attempts but not after the last; got %d", slept)
// Stealth disabled everywhere → empty stealthDomain (gateway advertises
// the baseline 3-rung ladder only).
got := chooseRestoreWebRTC(true, 7800, "turn.ns-x.dbrs.space", "state-secret", "", dbNone)
if got.stealthDomain != "" {
t.Errorf("stealthDomain = %q; want empty when stealth is disabled", got.stealthDomain)
}
}

View File

@ -801,23 +801,8 @@ func (s *SystemdSpawner) SaveClusterState(namespace string, data []byte) error {
return fmt.Errorf("failed to create namespace dir: %w", err)
}
path := filepath.Join(dir, "cluster-state.json")
// Atomic write to a temp file + rename: cluster-state.json carries the
// namespace TURN shared secret (bugboard #130), so it must not be
// world/group readable on the receiving node either, and a reader must
// never see a half-written secret. 0600 + chmod on the temp file keeps the
// secret private; the rename then makes the live file 0600 too, tightening
// a file an older release wrote 0644.
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0600); err != nil {
return fmt.Errorf("failed to write temp cluster state: %w", err)
}
if err := os.Chmod(tmp, 0600); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to set cluster state permissions: %w", err)
}
if err := os.Rename(tmp, path); err != nil {
os.Remove(tmp)
return fmt.Errorf("failed to rename cluster state into place: %w", err)
if err := os.WriteFile(path, data, 0644); err != nil {
return fmt.Errorf("failed to write cluster state: %w", err)
}
s.logger.Info("Saved cluster state from coordinator",
zap.String("namespace", namespace),

View File

@ -161,13 +161,6 @@ func (n *Node) startHTTPGateway(ctx context.Context) error {
zap.String("base_domain", clusterCfg.BaseDomain),
zap.String("base_data_dir", baseDataDir))
// Keep namespace raft leadership on co-located voters (bugboard #708):
// a geography-blind raft election can place leadership on a distant
// node, funneling every write across a ~256ms link into 5-10s RPCs.
// This reconciler hands leadership off an isolated leader to the nearest
// voter — never changing membership (all nodes stay voters).
clusterManager.StartLeaderLocalityReconciler(ctx)
// Restore previously-running namespace cluster processes in background.
// First try local state files (no DB dependency), then fall back to DB query with retries.
go func() {

View File

@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"net/http"
"sync"
"go.uber.org/zap"
@ -186,12 +185,6 @@ func (d *PushDispatcher) SendToUserDetailed(
out.Ok = false
} else {
r.Success = true
// Record the success status explicitly. A provider Send returns nil
// only on a 2xx delivery, so surface 200 instead of leaving
// HTTPStatus at its zero value — otherwise a successful push logs
// "http=0", which reads like an opaque failure and masks real
// false-success classes (bugboard #132).
r.HTTPStatus = http.StatusOK
out.DevicesSucceeded++
}
out.Results = append(out.Results, r)

View File

@ -20,15 +20,6 @@ import (
// provider's 5s because APNs is HTTP/2 + connection-reused.
const defaultSendTimeout = 10 * time.Second
// voipPushExpiry caps the apns-expiration on VoIP (call-invite) pushes to the
// ring window. A call signal that can't be delivered within this window is
// worse than undelivered: without an expiration APNs store-and-forwards it and
// lands it MINUTES later, firing a phantom "missed call" ring on the device and
// burning PushKit goodwill (bugboard #132). With it, APNs delivers promptly or
// DISCARDS — never a stale invite. Alert pushes keep the default
// store-and-forward behavior.
//
// The value is a relative duration: Send adds it to time.Now() when it builds
// a VoIP notification.
const voipPushExpiry = 30 * time.Second
// Provider is the APNs push.PushProvider implementation, scoped to one
// (Team ID, Key ID, p8 key, Bundle ID, Environment, Kind) tuple.
// Construct one per (namespace, kind) via the gateway dependency
@ -178,15 +169,6 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
n.Priority = apns2.PriorityLow
}
// Cap VoIP expiration to the ring window so APNs never store-and-forwards a
// stale call-invite into a phantom missed-call ring (bugboard #132). Without
// this, apns2 omits apns-expiration and APNs stores+retries for its default
// (minutes to days). Alert pushes intentionally keep the default so a
// message notification still lands after the device reconnects.
if p.kind == KindVoIP {
n.Expiration = time.Now().Add(voipPushExpiry)
}
// PushWithContext propagates cancellation through to the HTTP/2
// stream — abandoning ctx terminates the in-flight request, no
// goroutine leak.

View File

@ -4,7 +4,6 @@ import (
"context"
"net/http"
"testing"
"time"
"github.com/DeBrosOfficial/network/pkg/push"
"github.com/sideshow/apns2"
@ -186,43 +185,3 @@ func TestAlert_Send_EmptyContentStillRejected(t *testing.T) {
t.Fatal("alert path should still reject empty-content (bugboard #348); got nil")
}
}
// TestVoIP_Send_ExpirationCappedToRingWindow guards bugboard #132: a VoIP
// call-invite MUST carry a short apns-expiration so APNs never
// store-and-forwards a stale invite into a phantom missed-call ring minutes
// later. Without it apns2 omits the header, which means store-and-forward.
func TestVoIP_Send_ExpirationCappedToRingWindow(t *testing.T) {
	fake := &fakePushClient{
		resp: &apns2.Response{StatusCode: http.StatusOK, ApnsID: "voip-exp"},
	}
	provider := newTestProviderKind(t, "com.example.app", KindVoIP, fake)
	invite := push.PushMessage{
		DeviceToken: "VOIP-TOKEN",
		Data:        map[string]interface{}{"call_id": "x"},
	}

	// Reference instant taken just before the send; the expiration must fall
	// after it and no later than the ring window plus a small slack.
	before := time.Now()
	err := provider.Send(context.Background(), invite)
	if err != nil {
		t.Fatalf("Send: %v", err)
	}

	exp := fake.lastSent.Expiration
	if exp.IsZero() {
		t.Fatal("VoIP push has NO apns-expiration — APNs store-and-forwards → late phantom ring (#132)")
	}
	if !exp.After(before) {
		t.Errorf("expiration %v not in the future (before=%v)", exp, before)
	}
	latestAllowed := before.Add(voipPushExpiry + 2*time.Second)
	if exp.After(latestAllowed) {
		t.Errorf("expiration %v exceeds the ring-window cap (%s) — would allow a late ring", exp, voipPushExpiry)
	}
}
// TestAlert_Send_NoExpiration_keepsStoreAndForward checks that alert (message)
// pushes intentionally keep store-and-forward, i.e. carry no expiration, so a
// notification still lands after reconnect. Only the VoIP path is capped.
func TestAlert_Send_NoExpiration_keepsStoreAndForward(t *testing.T) {
	fake := &fakePushClient{
		resp: &apns2.Response{StatusCode: http.StatusOK, ApnsID: "alert-1"},
	}
	provider := newTestProviderKind(t, "com.example.app", KindAlert, fake)
	alert := push.PushMessage{DeviceToken: "ALERT-TOKEN", Title: "hi", Body: "msg"}

	err := provider.Send(context.Background(), alert)
	if err != nil {
		t.Fatalf("Send: %v", err)
	}
	if exp := fake.lastSent.Expiration; !exp.IsZero() {
		t.Errorf("alert push set expiration %v; want none (store-and-forward)", exp)
	}
}

View File

@ -23,14 +23,12 @@ package ntfy
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/push"
@ -47,34 +45,14 @@ type Config struct {
AuthToken string
// Timeout bounds each Send call. 0 selects 5 seconds.
Timeout time.Duration
// FanoutResolver, when set, returns the set of ntfy publish base URLs to
// deliver EACH publish to — one per active push node. The cluster runs an
// independent ntfy per node with NO shared message store, while subscribers
// are scattered across nodes by round-robin DNS; a publish that lands on one
// node only reaches subscribers on that node, losing ~(N-1)/N (bugboard
// #858). Fanning a publish to EVERY node guarantees it reaches whichever
// instance the subscriber's connection landed on. When nil, or it returns no
// hosts (or errors), Send falls back to the single BaseURL — so push never
// breaks if node discovery is unavailable.
FanoutResolver func(ctx context.Context) ([]string, error)
// FanoutHostHeader, when set, overrides the HTTP Host header and TLS SNI on
// fan-out requests. Needed because FanoutResolver returns per-node addresses
// (IPs) but each node's reverse proxy (Caddy) routes by — and serves its TLS
// cert for — the public push hostname. Empty: no override (tests /
// homogeneous hosts).
FanoutHostHeader string
}
// Provider is the ntfy push.PushProvider implementation.
type Provider struct {
baseURL string
authToken string
httpClient *http.Client
fanoutClient *http.Client
fanoutResolver func(ctx context.Context) ([]string, error)
fanoutHostHeader string
logger *zap.Logger
baseURL string
authToken string
httpClient *http.Client
logger *zap.Logger
}
// New creates a Provider with the given config.
@ -86,37 +64,18 @@ func New(cfg Config, logger *zap.Logger) *Provider {
if timeout <= 0 {
timeout = 5 * time.Second
}
p := &Provider{
baseURL: strings.TrimRight(cfg.BaseURL, "/"),
authToken: cfg.AuthToken,
httpClient: &http.Client{Timeout: timeout},
fanoutResolver: cfg.FanoutResolver,
fanoutHostHeader: cfg.FanoutHostHeader,
logger: logger.Named("ntfy"),
return &Provider{
baseURL: strings.TrimRight(cfg.BaseURL, "/"),
authToken: cfg.AuthToken,
httpClient: &http.Client{Timeout: timeout},
logger: logger.Named("ntfy"),
}
if cfg.FanoutResolver != nil {
// Fan-out requests dial per-node addresses but must present the public
// push hostname for SNI so each node's Caddy serves the right cert and
// routes to its local ntfy. A dedicated client carries that fixed SNI.
tr := &http.Transport{}
if cfg.FanoutHostHeader != "" {
tr.TLSClientConfig = &tls.Config{ServerName: cfg.FanoutHostHeader}
}
p.fanoutClient = &http.Client{Timeout: timeout, Transport: tr}
}
return p
}
// Name implements push.PushProvider.
func (p *Provider) Name() string { return "ntfy" }
// Send delivers a push notification to the device's ntfy topic.
//
// When a FanoutResolver is configured, the publish is delivered to EVERY active
// push node (the ntfy instances don't share state, so the subscriber's instance
// — whichever the round-robin LB picked — must be among the targets), and Send
// succeeds as long as at least one instance accepted it (bugboard #858).
// Otherwise it publishes to the single configured BaseURL.
func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
if msg.DeviceToken == "" {
return push.ErrEmptyToken
@ -125,7 +84,7 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
return fmt.Errorf("ntfy: base URL not configured")
}
topic, err := p.resolveTopic(msg.DeviceToken)
endpointURL, err := p.resolveEndpoint(msg.DeviceToken)
if err != nil {
return err
}
@ -143,73 +102,10 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error {
body = string(b)
}
// Resolve the set of base URLs to publish to. Default: the single base URL.
// With a fan-out resolver, publish to every active push node so the
// subscriber's instance is always covered. Resolver failure is non-fatal —
// fall back to the base URL so push keeps working.
bases := []string{p.baseURL}
httpClient := p.httpClient
hostHeader := ""
if p.fanoutResolver != nil {
if hosts, rerr := p.fanoutResolver(ctx); rerr != nil {
p.logger.Warn("ntfy fan-out node resolution failed; publishing to base URL only", zap.Error(rerr))
} else if len(hosts) > 0 {
bases = hosts
httpClient = p.fanoutClient
hostHeader = p.fanoutHostHeader
}
}
if len(bases) == 1 {
return p.postOne(ctx, httpClient, bases[0], topic, body, msg, hostHeader)
}
// Fan out concurrently. Success = at least one instance accepted the
// publish (the message is in the cluster). A node that's down is logged but
// does not fail the Send, since the message still reaches every reachable
// instance — including, in the common case, the subscriber's.
var wg sync.WaitGroup
errs := make([]error, len(bases))
for i, base := range bases {
wg.Add(1)
go func(i int, base string) {
defer wg.Done()
errs[i] = p.postOne(ctx, httpClient, base, topic, body, msg, hostHeader)
}(i, base)
}
wg.Wait()
okCount := 0
var firstErr error
for _, e := range errs {
if e == nil {
okCount++
} else if firstErr == nil {
firstErr = e
}
}
if okCount == 0 {
return fmt.Errorf("ntfy: fan-out to all %d push nodes failed: %w", len(bases), firstErr)
}
if okCount < len(bases) {
p.logger.Warn("ntfy fan-out partial failure (message still delivered to the reachable instances)",
zap.Int("delivered", okCount), zap.Int("total", len(bases)), zap.Error(firstErr))
}
return nil
}
// postOne publishes a single (already-resolved) topic+body to one ntfy base URL.
// hostHeader, when non-empty, overrides the HTTP Host header so a request dialed
// at a node IP is still routed by the node's proxy as the public push hostname.
func (p *Provider) postOne(ctx context.Context, httpClient *http.Client, base, topic, body string, msg push.PushMessage, hostHeader string) error {
endpointURL := strings.TrimRight(base, "/") + "/" + topic
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpointURL, strings.NewReader(body))
if err != nil {
return fmt.Errorf("ntfy: build request: %w", err)
}
if hostHeader != "" {
req.Host = hostHeader
}
if msg.Title != "" {
req.Header.Set("Title", msg.Title)
@ -231,15 +127,15 @@ func (p *Provider) postOne(ctx context.Context, httpClient *http.Client, base, t
req.Header.Set("Authorization", "Bearer "+p.authToken)
}
resp, err := httpClient.Do(req)
resp, err := p.httpClient.Do(req)
if err != nil {
return fmt.Errorf("ntfy: post: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return fmt.Errorf("ntfy: http %d: %s", resp.StatusCode, strings.TrimSpace(string(errBody)))
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return fmt.Errorf("ntfy: http %d: %s", resp.StatusCode, strings.TrimSpace(string(body)))
}
// Drain body to allow connection reuse.
@ -247,21 +143,20 @@ func (p *Provider) postOne(ctx context.Context, httpClient *http.Client, base, t
return nil
}
// resolveTopic maps a device token to the escaped ntfy topic path (without the
// base URL), so the same topic can be published to one or many push nodes.
// resolveEndpoint maps a device token to the ntfy publish URL.
//
// The token is one of two shapes:
//
// - A plain ntfy topic (possibly hierarchical, e.g. "ns/myapp/user-1") —
// each path segment is escaped so a crafted token can't break out of the
// topic path.
// published to "<baseURL>/<topic>", with each path segment escaped so a
// crafted token can't break out of the topic path.
// - A full UnifiedPush endpoint URL handed to the client by the ntfy
// distributor (e.g. "https://push.example.com/up<random>"). UnifiedPush
// requires the application server to POST to that endpoint, so we accept it
// — but ONLY after verifying its scheme+host match the configured base URL,
// then take only its path as the topic. That turns a device-supplied token
// into a publish only against our own push host, never an arbitrary one.
func (p *Provider) resolveTopic(token string) (string, error) {
// requires the application server to POST to that endpoint, so we accept it
// — but ONLY after verifying its scheme+host match the configured base URL,
// and we keep only its path (re-escaped; query/fragment dropped). That check
// confines any SSRF via a device-supplied token to our own push host, never
// an arbitrary one.
func (p *Provider) resolveEndpoint(token string) (string, error) {
topic := token
if isAbsoluteHTTPURL(token) {
u, err := url.Parse(token)
@ -278,7 +173,10 @@ func (p *Provider) resolveTopic(token string) (string, error) {
return "", fmt.Errorf("ntfy: endpoint host %q does not match configured push host %q", u.Host, base.Host)
}
// Confine the URL form to the SAME publish surface as a bare topic:
// take only the path as the topic, dropping any query/fragment.
// take only the path as the topic and re-build through the per-segment
// escaping below, dropping any query/fragment. So a UnifiedPush
// endpoint token can publish a topic but can't gain arbitrary path or
// query control on the push host beyond what a plain topic already has.
topic = strings.TrimPrefix(u.Path, "/")
if topic == "" {
return "", fmt.Errorf("ntfy: endpoint url %q has no topic path", token)
@ -290,7 +188,7 @@ func (p *Provider) resolveTopic(token string) (string, error) {
for i, seg := range parts {
parts[i] = url.PathEscape(seg)
}
return strings.Join(parts, "/"), nil
return p.baseURL + "/" + strings.Join(parts, "/"), nil
}
// isAbsoluteHTTPURL reports whether s looks like an absolute http(s) URL (the

View File

@ -8,7 +8,6 @@ import (
"net/http/httptest"
"net/url"
"strings"
"sync"
"testing"
"time"
@ -307,136 +306,3 @@ func TestName(t *testing.T) {
t.Errorf("expected Name=ntfy, got %s", p.Name())
}
}
// ----------------------------------------------------------------------------
// Bugboard #858 — cluster fan-out. Each push node runs an independent ntfy with
// no shared store, so a publish must reach EVERY node for the subscriber's
// instance (round-robin DNS picks one) to receive it.
// ----------------------------------------------------------------------------
// fanoutRecorder collects the topics a test ntfy node has received. The mutex
// guards topics because the HTTP handler appends to it from server goroutines.
type fanoutRecorder struct {
	mu     sync.Mutex
	topics []string
}

// newFanoutNode starts an httptest server standing in for one ntfy node. Every
// request's URL path (minus the leading "/") is appended to the returned
// recorder and answered with 200 OK. The caller owns the server and is
// responsible for closing it.
func newFanoutNode(t *testing.T) (*httptest.Server, *fanoutRecorder) {
	t.Helper()

	recorder := new(fanoutRecorder)
	handler := func(w http.ResponseWriter, r *http.Request) {
		topic := strings.TrimPrefix(r.URL.Path, "/")

		recorder.mu.Lock()
		recorder.topics = append(recorder.topics, topic)
		recorder.mu.Unlock()

		w.WriteHeader(http.StatusOK)
	}
	return httptest.NewServer(http.HandlerFunc(handler)), recorder
}

// count reports how many publishes the node has recorded so far.
func (r *fanoutRecorder) count() int {
	r.mu.Lock()
	n := len(r.topics)
	r.mu.Unlock()
	return n
}
// TestSend_fanout_publishesToAllNodes configures a resolver that returns three
// nodes and asserts a single Send lands exactly once on each of them, under
// the expected topic.
func TestSend_fanout_publishesToAllNodes(t *testing.T) {
	s1, r1 := newFanoutNode(t)
	defer s1.Close()
	s2, r2 := newFanoutNode(t)
	defer s2.Close()
	s3, r3 := newFanoutNode(t)
	defer s3.Close()

	resolveAll := func(context.Context) ([]string, error) {
		return []string{s1.URL, s2.URL, s3.URL}, nil
	}
	p := New(Config{
		// Base URL is still required; fan-out targets come from the resolver.
		BaseURL:        s1.URL,
		FanoutResolver: resolveAll,
	}, nil)

	msg := push.PushMessage{DeviceToken: "user-1", Body: "hi"}
	if err := p.Send(context.Background(), msg); err != nil {
		t.Fatalf("Send: %v", err)
	}

	for i, rec := range []*fanoutRecorder{r1, r2, r3} {
		received := rec.count()
		if received != 1 {
			t.Errorf("node %d received %d publishes; want exactly 1 (the publish must reach every node)", i+1, received)
			continue
		}
		if topic := rec.topics[0]; topic != "user-1" {
			t.Errorf("node %d got topic %q; want user-1", i+1, topic)
		}
	}
}
// TestSend_fanout_oneNodeDown_stillSucceeds fans out to one live node and one
// already-closed (unreachable) node. Send must still succeed because at least
// one node accepted the publish, and the live node must have recorded it.
func TestSend_fanout_oneNodeDown_stillSucceeds(t *testing.T) {
	live, liveRec := newFanoutNode(t)
	defer live.Close()
	dead, _ := newFanoutNode(t)
	dead.Close() // unreachable

	resolver := func(context.Context) ([]string, error) {
		return []string{live.URL, dead.URL}, nil
	}
	p := New(Config{BaseURL: live.URL, FanoutResolver: resolver}, nil)

	err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"})
	if err != nil {
		t.Fatalf("Send should succeed when at least one node is up; got %v", err)
	}
	if received := liveRec.count(); received != 1 {
		t.Errorf("the up node should have received the publish; got %d", received)
	}
}
// TestSend_fanout_allNodesDown_returnsError fans out to two already-closed
// nodes and expects Send to report an error, since no node accepted the
// publish.
func TestSend_fanout_allNodesDown_returnsError(t *testing.T) {
	first, _ := newFanoutNode(t)
	first.Close()
	second, _ := newFanoutNode(t)
	second.Close()

	resolver := func(context.Context) ([]string, error) {
		return []string{first.URL, second.URL}, nil
	}
	p := New(Config{
		// Unused for posting; it just has to be non-empty.
		BaseURL:        "http://127.0.0.1:1",
		FanoutResolver: resolver,
	}, nil)

	err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"})
	if err == nil {
		t.Fatal("Send should fail when every node is unreachable")
	}
}
// TestSend_fanout_resolverEmpty_fallsBackToBaseURL uses a resolver that reports
// no active nodes and expects the publish to go to the configured base URL
// instead.
func TestSend_fanout_resolverEmpty_fallsBackToBaseURL(t *testing.T) {
	base, baseRec := newFanoutNode(t)
	defer base.Close()

	// Resolver succeeds but yields no active nodes.
	noNodes := func(context.Context) ([]string, error) { return nil, nil }
	p := New(Config{BaseURL: base.URL, FanoutResolver: noNodes}, nil)

	err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"})
	if err != nil {
		t.Fatalf("Send: %v", err)
	}
	if received := baseRec.count(); received != 1 {
		t.Errorf("empty resolver must fall back to the base URL; base got %d publishes", received)
	}
}
// TestSend_fanout_resolverError_fallsBackToBaseURL uses a resolver that returns
// an error and expects Send to neither fail nor drop the publish: it must be
// delivered to the configured base URL.
func TestSend_fanout_resolverError_fallsBackToBaseURL(t *testing.T) {
	base, baseRec := newFanoutNode(t)
	defer base.Close()

	failing := func(context.Context) ([]string, error) {
		return nil, context.DeadlineExceeded
	}
	p := New(Config{BaseURL: base.URL, FanoutResolver: failing}, nil)

	err := p.Send(context.Background(), push.PushMessage{DeviceToken: "t", Body: "x"})
	if err != nil {
		t.Fatalf("resolver error must not fail the push (fall back to base URL); got %v", err)
	}
	if received := baseRec.count(); received != 1 {
		t.Errorf("resolver error must fall back to the base URL; base got %d publishes", received)
	}
}

View File

@ -10,39 +10,53 @@ import (
"go.uber.org/zap"
)
// GetRaftStatus queries a local rqlite node's /status endpoint.
func GetRaftStatus(port int) (*RQLiteStatus, error) {
// TransferLeadership attempts to transfer Raft leadership to another voter.
// Used by both the RQLiteManager (on Stop) and the CLI (pre-upgrade).
// Returns nil if this node is not the leader or if transfer succeeds.
func TransferLeadership(port int, logger *zap.Logger) error {
client := &http.Client{Timeout: 5 * time.Second}
resp, err := client.Get(fmt.Sprintf("http://localhost:%d/status", port))
// 1. Check if we're the leader
statusURL := fmt.Sprintf("http://localhost:%d/status", port)
resp, err := client.Get(statusURL)
if err != nil {
return nil, fmt.Errorf("failed to query status: %w", err)
return fmt.Errorf("failed to query status: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read status: %w", err)
return fmt.Errorf("failed to read status: %w", err)
}
var status RQLiteStatus
if err := json.Unmarshal(body, &status); err != nil {
return nil, fmt.Errorf("failed to parse status: %w", err)
return fmt.Errorf("failed to parse status: %w", err)
}
return &status, nil
}
// GetRaftNodes queries a local rqlite node's /nodes endpoint (voters +
// non-voters, with reachability).
func GetRaftNodes(port int) (RQLiteNodes, error) {
client := &http.Client{Timeout: 5 * time.Second}
resp, err := client.Get(fmt.Sprintf("http://localhost:%d/nodes?nonvoters&ver=2&timeout=5s", port))
if err != nil {
return nil, fmt.Errorf("failed to query nodes: %w", err)
if status.Store.Raft.State != "Leader" {
logger.Debug("Not the leader, skipping transfer", zap.Int("port", port))
return nil
}
defer resp.Body.Close()
nodesBody, err := io.ReadAll(resp.Body)
logger.Info("This node is the Raft leader, attempting leadership transfer",
zap.Int("port", port),
zap.String("leader_id", status.Store.Raft.LeaderID))
// 2. Find an eligible voter to transfer to
nodesURL := fmt.Sprintf("http://localhost:%d/nodes?nonvoters&ver=2&timeout=5s", port)
nodesResp, err := client.Get(nodesURL)
if err != nil {
return nil, fmt.Errorf("failed to read nodes: %w", err)
return fmt.Errorf("failed to query nodes: %w", err)
}
// Try ver=2 wrapped format, fall back to plain array.
defer nodesResp.Body.Close()
nodesBody, err := io.ReadAll(nodesResp.Body)
if err != nil {
return fmt.Errorf("failed to read nodes: %w", err)
}
// Try ver=2 wrapped format, fall back to plain array
var nodes RQLiteNodes
var wrapped struct {
Nodes RQLiteNodes `json:"nodes"`
@ -52,28 +66,8 @@ func GetRaftNodes(port int) (RQLiteNodes, error) {
} else {
_ = json.Unmarshal(nodesBody, &nodes)
}
return nodes, nil
}
// TransferLeadership attempts to transfer Raft leadership to another voter.
// Used by both the RQLiteManager (on Stop) and the CLI (pre-upgrade).
// Returns nil if this node is not the leader or if transfer succeeds.
func TransferLeadership(port int, logger *zap.Logger) error {
status, err := GetRaftStatus(port)
if err != nil {
return err
}
if status.Store.Raft.State != "Leader" {
logger.Debug("Not the leader, skipping transfer", zap.Int("port", port))
return nil
}
nodes, err := GetRaftNodes(port)
if err != nil {
return err
}
// Find any reachable voter that is NOT us.
// Find a reachable voter that is NOT us
var targetID string
for _, n := range nodes {
if n.Voter && n.Reachable && n.ID != status.Store.Raft.LeaderID {
@ -81,55 +75,57 @@ func TransferLeadership(port int, logger *zap.Logger) error {
break
}
}
if targetID == "" {
logger.Warn("No eligible voter found for leadership transfer — will rely on SIGTERM graceful step-down",
zap.Int("port", port))
return nil
}
return TransferLeadershipTo(port, targetID, logger)
}
// TransferLeadershipTo transfers Raft leadership to a SPECIFIC target node ID
// (its raft address). The caller is responsible for confirming this node is the
// leader and that targetID is an eligible voter. Tolerant of a missing API
// (404) and a non-OK status — it logs and returns nil so callers treat transfer
// as best-effort.
func TransferLeadershipTo(port int, targetID string, logger *zap.Logger) error {
client := &http.Client{Timeout: 5 * time.Second}
logger.Info("Attempting Raft leadership transfer",
zap.Int("port", port), zap.String("target", targetID))
// 3. Attempt transfer via rqlite v8+ API
// POST /nodes/<target>/transfer-leadership
// If the API doesn't exist (404), fall back to relying on SIGTERM.
transferURL := fmt.Sprintf("http://localhost:%d/nodes/%s/transfer-leadership", port, targetID)
transferResp, err := client.Post(transferURL, "application/json", nil)
if err != nil {
logger.Warn("Leadership transfer request failed", zap.Error(err))
logger.Warn("Leadership transfer request failed, relying on SIGTERM",
zap.Error(err))
return nil
}
transferResp.Body.Close()
switch {
case transferResp.StatusCode == http.StatusNotFound:
logger.Info("Leadership transfer API not available (rqlite version)")
if transferResp.StatusCode == http.StatusNotFound {
logger.Info("Leadership transfer API not available (rqlite version), relying on SIGTERM")
return nil
case transferResp.StatusCode != http.StatusOK:
}
if transferResp.StatusCode != http.StatusOK {
logger.Warn("Leadership transfer returned unexpected status",
zap.Int("status", transferResp.StatusCode))
return nil
}
// Verify.
// 4. Verify transfer
time.Sleep(2 * time.Second)
newStatus, err := GetRaftStatus(port)
verifyResp, err := client.Get(statusURL)
if err != nil {
logger.Info("Could not verify transfer (node may have already stepped down)")
return nil
}
if newStatus.Store.Raft.State != "Leader" {
logger.Info("Leadership transferred successfully",
zap.String("new_leader", newStatus.Store.Raft.LeaderID), zap.Int("port", port))
} else {
logger.Warn("Still leader after transfer attempt", zap.Int("port", port))
defer verifyResp.Body.Close()
verifyBody, _ := io.ReadAll(verifyResp.Body)
var newStatus RQLiteStatus
if err := json.Unmarshal(verifyBody, &newStatus); err == nil {
if newStatus.Store.Raft.State != "Leader" {
logger.Info("Leadership transferred successfully",
zap.String("new_leader", newStatus.Store.Raft.LeaderID),
zap.Int("port", port))
} else {
logger.Warn("Still leader after transfer attempt — will rely on SIGTERM",
zap.Int("port", port))
}
}
return nil
}

View File

@ -118,16 +118,16 @@ func (i *Invoker) Invoke(ctx context.Context, req *InvokeRequest) (*InvokeRespon
// #264). The auth boundary for system triggers is at REGISTRATION
// time (HTTP `POST /v1/functions/{name}/triggers`, or deploy-time
// auto-register from function.yaml), not at firing time.
if !isSystemTrigger(req.TriggerType) && !canInvokeFn(fn, req.CallerWallet) {
// Authorization uses the function we already fetched above —
// CanInvoke would re-`registry.Get` it, a redundant leader-routed
// read on every op (bugboard #708).
return &InvokeResponse{
RequestID: requestID,
Status: InvocationStatusError,
Error: "unauthorized",
DurationMS: time.Since(startTime).Milliseconds(),
}, ErrUnauthorized
if !isSystemTrigger(req.TriggerType) {
authorized, err := i.CanInvoke(ctx, req.Namespace, req.FunctionName, req.CallerWallet)
if err != nil || !authorized {
return &InvokeResponse{
RequestID: requestID,
Status: InvocationStatusError,
Error: "unauthorized",
DurationMS: time.Since(startTime).Milliseconds(),
}, ErrUnauthorized
}
}
// Get environment variables
@ -493,7 +493,7 @@ func (i *Invoker) BatchInvoke(ctx context.Context, req *BatchInvokeRequest) (*Ba
func isSystemTrigger(t TriggerType) bool {
switch t {
case TriggerTypeCron, TriggerTypePubSub, TriggerTypeDatabase,
TriggerTypeTimer, TriggerTypeJob, TriggerTypeInternal:
TriggerTypeTimer, TriggerTypeJob:
return true
}
return false
@ -504,19 +504,20 @@ func (i *Invoker) CanInvoke(ctx context.Context, namespace, functionName string,
if err != nil {
return false, err
}
return canInvokeFn(fn, callerWallet), nil
}
// canInvokeFn is the pure authorization decision for an already-fetched
// function, so the hot Invoke path doesn't re-read the registry (bugboard
// #708). Public functions are open; a private function only requires that the
// caller has SOME identity — the auth middleware already verified namespace
// membership before the function ran.
func canInvokeFn(fn *Function, callerWallet string) bool {
// Public functions can be invoked by anyone (auth middleware allows
// the request through without credentials).
if fn.IsPublic {
return true
return true, nil
}
return strings.TrimSpace(callerWallet) != ""
// Private function: require an authenticated caller. The auth
// middleware has already verified the caller belongs to this
// namespace (either via JWT `namespace` claim or via API-key
// namespace lookup) before this function ever runs, so the only
// thing we need to confirm here is that the caller has SOME
// identity at all (i.e. the request wasn't anonymous).
return strings.TrimSpace(callerWallet) != "", nil
}
// GetFunctionInfo returns basic info about a function for invocation.

View File

@ -6,9 +6,7 @@ import (
"database/sql"
"fmt"
"io"
"strconv"
"strings"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/ipfs"
@ -17,27 +15,6 @@ import (
"go.uber.org/zap"
)
// registryCacheTTL bounds how long function metadata + env vars are cached
// in-process before re-reading rqlite. Bugboard #708: every function_invoke
// previously did 3 uncached `weak` reads (Get, a redundant Get inside
// CanInvoke, and GetEnvVars), each forwarded to the raft leader — ~820ms of
// pure pre-flight tax per op when the leader is a distant node. With a short
// TTL + explicit invalidation on deploy/enable/disable/delete, a burst of RPCs
// (e.g. a call setup) reads metadata once instead of N times. The TTL is a
// backstop; correctness comes from the explicit invalidation, so cross-node
// propagation of a deploy/disable is bounded to this TTL.
const registryCacheTTL = 5 * time.Second
// fnCacheEntry is one slot of the function-metadata cache: the cached
// *Function together with the time it was stored. cachedFn compares that
// timestamp against the registry's cacheTTL to decide whether the slot is
// still usable.
type fnCacheEntry struct {
fn *Function // cached metadata; shared with every caller, so treat as read-only
at time.Time // store time (storeFn records time.Now() here)
}
// envCacheEntry is one slot of the env-var cache: the cached key/value map
// together with the time it was stored. cachedEnv compares that timestamp
// against the registry's cacheTTL to decide whether the slot is still usable.
type envCacheEntry struct {
env map[string]string // cached env vars; shared with every caller, so treat as read-only
at time.Time // store time (storeEnv records time.Now() here)
}
// Ensure Registry implements FunctionRegistry and InvocationLogger interfaces.
var _ FunctionRegistry = (*Registry)(nil)
var _ InvocationLogger = (*Registry)(nil)
@ -50,12 +27,6 @@ type Registry struct {
ipfsAPIURL string
logger *zap.Logger
tableName string
// Metadata cache (bugboard #708) — see registryCacheTTL.
cacheTTL time.Duration
cacheMu sync.RWMutex
fnCache map[string]fnCacheEntry // key: namespace\x00name\x00version
envCache map[string]envCacheEntry // key: functionID
}
// RegistryConfig holds configuration for the Registry.
@ -71,78 +42,9 @@ func NewRegistry(db rqlite.Client, ipfsClient ipfs.IPFSClient, cfg RegistryConfi
ipfsAPIURL: cfg.IPFSAPIURL,
logger: logger,
tableName: "functions",
cacheTTL: registryCacheTTL,
fnCache: make(map[string]fnCacheEntry),
envCache: make(map[string]envCacheEntry),
}
}
// --- metadata cache (bugboard #708) ---
//
// The cached *Function and env map are SHARED with all callers and MUST be
// treated as read-only — no consumer in pkg/serverless mutates them today, and
// none may, or it would corrupt the cache for concurrent readers.
// fnCacheKey builds the function-cache key for (namespace, name, version).
// The three parts are joined with NUL bytes so neighbouring fields cannot run
// together: ("a", "bc") and ("ab", "c") produce different keys.
func fnCacheKey(namespace, name string, version int) string {
	const sep = "\x00"
	versionPart := strconv.Itoa(version)
	return namespace + sep + name + sep + versionPart
}
// cachedFn looks up key in the function cache. It reports false when no entry
// exists or when the entry is older than r.cacheTTL; an expired entry is left
// in the map and simply treated as a miss. The returned *Function is the
// cached instance itself, not a copy.
func (r *Registry) cachedFn(key string) (*Function, bool) {
	r.cacheMu.RLock()
	entry, found := r.fnCache[key]
	r.cacheMu.RUnlock()

	if found && time.Since(entry.at) <= r.cacheTTL {
		return entry.fn, true
	}
	return nil, false
}
// storeFn records fn under key in the function cache, stamping the entry with
// the current time so cachedFn can age it out. An existing entry for the same
// key is overwritten.
func (r *Registry) storeFn(key string, fn *Function) {
	r.cacheMu.Lock()
	defer r.cacheMu.Unlock()

	r.fnCache[key] = fnCacheEntry{at: time.Now(), fn: fn}
}
// cachedEnv looks up the env-var map cached for functionID. It reports false
// when nothing is cached or when the entry is older than r.cacheTTL. The
// returned map is the cached instance itself, not a copy.
func (r *Registry) cachedEnv(functionID string) (map[string]string, bool) {
	r.cacheMu.RLock()
	entry, found := r.envCache[functionID]
	r.cacheMu.RUnlock()

	if found && time.Since(entry.at) <= r.cacheTTL {
		return entry.env, true
	}
	return nil, false
}
// storeEnv records env as the cached env vars for functionID, stamping the
// entry with the current time so cachedEnv can age it out. The map is stored
// as given (no copy is made).
func (r *Registry) storeEnv(functionID string, env map[string]string) {
	r.cacheMu.Lock()
	defer r.cacheMu.Unlock()

	r.envCache[functionID] = envCacheEntry{at: time.Now(), env: env}
}
// invalidateFn removes every cached version of the function identified by
// (namespace, name). Both arguments are whitespace-trimmed before matching.
// Matching is by key prefix "<namespace>\x00<name>\x00", so the trailing NUL
// keeps a function whose name merely starts with the same characters from
// being dropped. Register, SetEnabled and Delete call this after their write
// so a metadata change is not masked by the cache.
func (r *Registry) invalidateFn(namespace, name string) {
	ns, fnName := strings.TrimSpace(namespace), strings.TrimSpace(name)
	versionPrefix := ns + "\x00" + fnName + "\x00"

	r.cacheMu.Lock()
	defer r.cacheMu.Unlock()

	for cacheKey := range r.fnCache {
		if !strings.HasPrefix(cacheKey, versionPrefix) {
			continue
		}
		delete(r.fnCache, cacheKey)
	}
}
// invalidateEnv forgets the cached env vars for functionID. Register calls it
// after a deploy: a redeploy keeps the existing function ID (id = oldFn.ID)
// and rewrites the env vars stored under it, so leaving the old entry in place
// would serve stale values until the TTL ran out.
func (r *Registry) invalidateEnv(functionID string) {
	r.cacheMu.Lock()
	defer r.cacheMu.Unlock()

	delete(r.envCache, functionID)
}
// Register deploys a new function or updates an existing one.
func (r *Registry) Register(ctx context.Context, fn *FunctionDefinition, wasmBytes []byte) (*Function, error) {
if fn == nil {
@ -226,9 +128,6 @@ func (r *Registry) Register(ctx context.Context, fn *FunctionDefinition, wasmByt
return nil, &DeployError{FunctionName: fn.Name, Cause: err}
}
r.invalidateFn(fn.Namespace, fn.Name)
r.invalidateEnv(id)
r.logger.Info("Function registered",
zap.String("id", id),
zap.String("name", fn.Name),
@ -247,12 +146,6 @@ func (r *Registry) Get(ctx context.Context, namespace, name string, version int)
namespace = strings.TrimSpace(namespace)
name = strings.TrimSpace(name)
// Cache hit (bugboard #708): skip the leader-routed weak read entirely.
cacheKey := fnCacheKey(namespace, name, version)
if fn, ok := r.cachedFn(cacheKey); ok {
return fn, nil
}
var query string
var args []interface{}
@ -291,17 +184,13 @@ func (r *Registry) Get(ctx context.Context, namespace, name string, version int)
}
if len(functions) == 0 {
// Do NOT cache misses — a just-deployed function must be visible
// immediately on the next call, not after the TTL.
if version == 0 {
return nil, ErrFunctionNotFound
}
return nil, ErrVersionNotFound
}
fn := r.rowToFunction(&functions[0])
r.storeFn(cacheKey, fn)
return fn, nil
return r.rowToFunction(&functions[0]), nil
}
// List returns all functions for a namespace.
@ -363,7 +252,6 @@ func (r *Registry) SetEnabled(ctx context.Context, namespace, name string, enabl
if rowsAffected == 0 {
return ErrFunctionNotFound
}
r.invalidateFn(namespace, name)
r.logger.Info("Function enabled-state updated",
zap.String("namespace", namespace),
zap.String("name", name),
@ -402,8 +290,6 @@ func (r *Registry) Delete(ctx context.Context, namespace, name string, version i
return ErrVersionNotFound
}
r.invalidateFn(namespace, name)
r.logger.Info("Function deleted",
zap.String("namespace", namespace),
zap.String("name", name),
@ -435,10 +321,6 @@ func (r *Registry) GetWASMBytes(ctx context.Context, wasmCID string) ([]byte, er
// GetEnvVars retrieves environment variables for a function.
func (r *Registry) GetEnvVars(ctx context.Context, functionID string) (map[string]string, error) {
if env, ok := r.cachedEnv(functionID); ok {
return env, nil
}
query := `SELECT key, value FROM function_env_vars WHERE function_id = ?`
var rows []envVarRow
@ -451,7 +333,6 @@ func (r *Registry) GetEnvVars(ctx context.Context, functionID string) (map[strin
envVars[row.Key] = row.Value
}
r.storeEnv(functionID, envVars)
return envVars, nil
}

View File

@ -1,118 +0,0 @@
package serverless
import (
"testing"
"time"
"go.uber.org/zap"
)
// Bugboard #708 — function metadata + env vars are cached in-process so a burst
// of invokes doesn't pay a leader-routed weak read per op. These pin the cache
// hit/miss/TTL/invalidation behavior and the dedup'd authorization decision.
// newTestRegistry returns a Registry wired to the mock rqlite and IPFS
// clients, a zero-value RegistryConfig, and a no-op logger.
func newTestRegistry() *Registry {
	db, ipfsClient := NewMockRQLite(), NewMockIPFSClient()
	return NewRegistry(db, ipfsClient, RegistryConfig{}, zap.NewNop())
}
// TestRegistryCache_hitAndInvalidate checks the basic life cycle of a function
// cache entry: miss on an empty cache, hit (same pointer) after storeFn, and
// removal of every cached version after invalidateFn.
func TestRegistryCache_hitAndInvalidate(t *testing.T) {
	reg := newTestRegistry()
	latestKey := fnCacheKey("ns", "fn", 0)
	pinnedKey := fnCacheKey("ns", "fn", 3)
	stored := &Function{ID: "id-1", Name: "fn", Namespace: "ns"}

	if _, hit := reg.cachedFn(latestKey); hit {
		t.Fatal("empty cache must miss")
	}

	reg.storeFn(latestKey, stored)
	if got, hit := reg.cachedFn(latestKey); !hit || got != stored {
		t.Fatalf("expected cache hit returning the stored fn; ok=%v got=%v", hit, got)
	}

	// Deploy/enable/disable/delete must drop every cached version.
	reg.storeFn(pinnedKey, &Function{ID: "id-3", Name: "fn", Namespace: "ns"})
	reg.invalidateFn("ns", "fn")

	mustBeGone := []struct{ key, msg string }{
		{latestKey, "invalidateFn must drop the version-0 entry"},
		{pinnedKey, "invalidateFn must drop ALL versions of the function"},
	}
	for _, want := range mustBeGone {
		if _, hit := reg.cachedFn(want.key); hit {
			t.Error(want.msg)
		}
	}
}
// TestRegistryCache_invalidateScopedToFunction checks that invalidating one
// function leaves a different function in the same namespace cached.
func TestRegistryCache_invalidateScopedToFunction(t *testing.T) {
	reg := newTestRegistry()
	keepKey, dropKey := fnCacheKey("ns", "keep", 0), fnCacheKey("ns", "drop", 0)
	reg.storeFn(keepKey, &Function{ID: "k", Name: "keep", Namespace: "ns"})
	reg.storeFn(dropKey, &Function{ID: "d", Name: "drop", Namespace: "ns"})

	reg.invalidateFn("ns", "drop")

	if _, hit := reg.cachedFn(dropKey); hit {
		t.Error("target function must be invalidated")
	}
	if _, hit := reg.cachedFn(keepKey); !hit {
		t.Error("a DIFFERENT function must NOT be invalidated (prefix must include the null separator)")
	}
}
// TestRegistryCache_ttlExpiry plants an entry whose timestamp lies two TTLs in
// the past and checks that cachedFn reports it as a miss.
func TestRegistryCache_ttlExpiry(t *testing.T) {
	reg := newTestRegistry()
	key := fnCacheKey("ns", "fn", 0)

	// Backdate the entry beyond the TTL.
	staleAt := time.Now().Add(-2 * reg.cacheTTL)
	reg.fnCache[key] = fnCacheEntry{fn: &Function{ID: "x"}, at: staleAt}

	if _, hit := reg.cachedFn(key); hit {
		t.Error("an entry older than the TTL must be treated as a miss")
	}
}
// TestRegistryCache_envHitAndTTL covers the env-var cache: miss when empty,
// hit after storeEnv, and miss again once the entry is older than the TTL.
func TestRegistryCache_envHitAndTTL(t *testing.T) {
	const id = "fid"
	reg := newTestRegistry()

	if _, hit := reg.cachedEnv(id); hit {
		t.Fatal("empty env cache must miss")
	}

	reg.storeEnv(id, map[string]string{"K": "V"})
	env, hit := reg.cachedEnv(id)
	if !hit || env["K"] != "V" {
		t.Fatalf("expected env cache hit; ok=%v env=%v", hit, env)
	}

	// Replace the entry with one stamped two TTLs in the past.
	staleAt := time.Now().Add(-2 * reg.cacheTTL)
	reg.envCache[id] = envCacheEntry{env: map[string]string{"K": "V"}, at: staleAt}
	if _, hit := reg.cachedEnv(id); hit {
		t.Error("env entry older than the TTL must miss")
	}
}
// TestRegistryCache_envInvalidatedOnRedeploy checks that invalidateEnv removes
// a cached env map. A redeploy reuses the function ID (Register: id =
// oldFn.ID) and rewrites env vars under it, so Register has to drop the env
// cache for that ID; otherwise a changed env var (e.g. a rotated endpoint)
// would be masked for up to the TTL.
func TestRegistryCache_envInvalidatedOnRedeploy(t *testing.T) {
	const id = "fid"
	reg := newTestRegistry()

	reg.storeEnv(id, map[string]string{"K": "old"})
	env, hit := reg.cachedEnv(id)
	if !hit || env["K"] != "old" {
		t.Fatal("precondition: env should be cached")
	}

	reg.invalidateEnv(id) // what Register now calls

	if _, hit := reg.cachedEnv(id); hit {
		t.Error("env cache must be invalidated on redeploy (reused ID); a changed env var must not be served stale")
	}
}
// TestRegistryCache_keyDistinctNoCollision guards the NUL-separated key
// format: namespace "a" + name "bc" must not produce the same key as
// namespace "ab" + name "c".
func TestRegistryCache_keyDistinctNoCollision(t *testing.T) {
	left, right := fnCacheKey("a", "bc", 0), fnCacheKey("ab", "c", 0)
	if left == right {
		t.Error("cache keys must not collide across namespace/name boundaries")
	}
}
// TestCanInvokeFn pins the authorization decision made by canInvokeFn: public
// functions accept any caller, private functions require a caller wallet that
// is non-empty after trimming whitespace.
func TestCanInvokeFn(t *testing.T) {
	cases := []struct {
		public bool
		caller string
		want   bool
		msg    string
	}{
		{true, "", true, "public function must be invokable by an anonymous caller"},
		{false, "", false, "private function must reject an empty (anonymous) caller"},
		{false, "   ", false, "private function must reject a whitespace-only caller"},
		{false, "wallet-abc", true, "private function must accept an identified caller"},
	}
	for _, tc := range cases {
		if got := canInvokeFn(&Function{IsPublic: tc.public}, tc.caller); got != tc.want {
			t.Error(tc.msg)
		}
	}
}

View File

@ -30,11 +30,6 @@ const (
TriggerTypePubSub TriggerType = "pubsub"
TriggerTypeTimer TriggerType = "timer"
TriggerTypeJob TriggerType = "job"
// TriggerTypeInternal marks a gateway-initiated invocation with no end-user
// caller (e.g. the auth claims-provider hook at JWT mint time, bugboard
// #548). Treated as a system trigger so the per-caller authorization check
// is skipped — the gateway is the trusted invoker.
TriggerTypeInternal TriggerType = "internal"
)
// JobStatus represents the current state of a background job.
@ -239,8 +234,8 @@ type FunctionDefinition struct {
// When WSPersistent is true, the function exports ws_open/ws_frame/ws_close
// instead of using the default per-frame stateless model.
WSPersistent bool `json:"ws_persistent,omitempty"`
WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` // 0 = no idle timeout
WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` // 0 = use default 256 KB
WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` // 0 = no idle timeout
WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` // 0 = use default 256 KB
WSMaxInflightPerConn int `json:"ws_max_inflight_per_conn,omitempty"` // 0 = use default 64
// RawHTTPResponse enables raw-HTTP-response mode (bugboard #835): the
@ -289,11 +284,11 @@ type Function struct {
// InvocationContext provides context for a function invocation.
type InvocationContext struct {
RequestID string `json:"request_id"`
FunctionID string `json:"function_id"`
FunctionName string `json:"function_name"`
Namespace string `json:"namespace"`
CallerWallet string `json:"caller_wallet,omitempty"`
RequestID string `json:"request_id"`
FunctionID string `json:"function_id"`
FunctionName string `json:"function_name"`
Namespace string `json:"namespace"`
CallerWallet string `json:"caller_wallet,omitempty"`
// CallerIP is the source IP of the request, populated by HTTP/WS handlers.
// Used by the multi-tier rate limiter as a fallback bucket for anonymous
// (no-wallet) callers.

View File

@ -1,6 +1,6 @@
{
"name": "@debros/orama",
"version": "0.122.55",
"version": "0.122.47",
"description": "TypeScript SDK for Orama Network - Database, PubSub, Cache, Storage, Vault, and more",
"type": "module",
"main": "./dist/index.js",

View File

@ -167,41 +167,6 @@ export class HttpClient {
return this.baseURL;
}
/**
* Normalize any thrown error into a typed SDKError so callers can branch on
* `.code`/`.httpStatus` instead of string-matching a bare platform
* `TypeError: Network request failed` (bugboard #129).
*
* - SDKError (an HTTP error response) passes through unchanged.
* - An AbortError (our own per-request timeout firing) → code "TIMEOUT".
* - Anything else (fetch rejects with a TypeError on DNS failure, connection
* refused, offline, or TLS error) → code "NETWORK_ERROR".
*
* In every network case httpStatus is 0 (no HTTP response was received), which
* is how the app distinguishes "couldn't reach the gateway" from a real 4xx/5xx.
*
* @param error - whatever was thrown; not assumed to be an Error instance.
* @param timeoutMs - the request timeout, used only in the TIMEOUT message text.
* @returns the original SDKError, or a new SDKError with httpStatus 0.
*/
private normalizeError(error: unknown, timeoutMs: number): SDKError {
// Already typed: return the same instance so status/code are preserved.
if (error instanceof SDKError) {
return error;
}
// `name` is read defensively because `error` may be a non-Error value.
const name = (error as { name?: string })?.name;
const message = error instanceof Error ? error.message : String(error);
if (name === "AbortError") {
return new SDKError(
`request timed out after ${timeoutMs}ms`,
0,
"TIMEOUT",
{ cause: name }
);
}
// Fall back to a generic message when the thrown value had an empty one.
return new SDKError(
message || "network request failed",
0,
"NETWORK_ERROR",
{ cause: name }
);
}
async request<T = any>(
method: "GET" | "POST" | "PUT" | "DELETE",
path: string,
@ -333,14 +298,18 @@ export class HttpClient {
}
}
// Normalize native errors (TypeError, AbortError) into a typed SDKError
// so the app gets a stable `.code`/`.httpStatus` instead of a bare
// platform "Network request failed" (bugboard #129).
const sdkError = this.normalizeError(error, requestTimeout);
// Call the network error callback if configured. This allows the app to
// trigger gateway failover.
// Call the network error callback if configured
// This allows the app to trigger gateway failover
if (this.onNetworkError) {
// Convert native errors (TypeError, AbortError) to SDKError for the callback
const sdkError =
error instanceof SDKError
? error
: new SDKError(
error instanceof Error ? error.message : String(error),
0, // httpStatus 0 indicates network-level failure
"NETWORK_ERROR"
);
this.onNetworkError(sdkError, {
method,
path,
@ -349,7 +318,7 @@ export class HttpClient {
});
}
throw sdkError;
throw error;
} finally {
clearTimeout(timeoutId);
}

View File

@ -1,88 +0,0 @@
import { describe, it, expect, vi } from "vitest";
import { HttpClient } from "../../../src/core/http";
import { SDKError } from "../../../src/errors";
/**
* Bugboard #129 — typed network errors.
*
* Before this fix the HttpClient re-threw the raw platform error on a
* network-level failure, so a caller (e.g. AnChat's JwtSession) could only
* tell "couldn't reach the gateway" apart from a real HTTP error by
* string-matching `TypeError: Network request failed`. These guards lock in
* that every transport failure surfaces as a typed SDKError with httpStatus 0
* and a stable `.code`, while genuine HTTP errors keep their status/code.
*/
// Suite for bug #129: every transport-level failure must surface as an
// SDKError with httpStatus 0 and a stable code, while real HTTP error
// responses keep their own status and code.
describe("Bug #129 — HttpClient surfaces typed network errors", () => {
// Builds an HttpClient around a stubbed fetch. Retries are disabled
// (maxRetries: 0) and the timeout is 5000 ms, the value the TIMEOUT test
// expects to find in the error message.
function client(fetchImpl: typeof fetch, onNetworkError?: any) {
return new HttpClient({
baseURL: "https://gw.example",
maxRetries: 0,
timeout: 5000,
fetch: fetchImpl,
onNetworkError,
});
}
it("maps a fetch TypeError (connection failure) to SDKError NETWORK_ERROR / status 0", async () => {
const fetchSpy = vi.fn(async () => {
throw new TypeError("Network request failed");
});
// `.catch((e) => e)` turns the rejection into a value we can assert on.
const err = await client(fetchSpy as any)
.post("/v1/auth/refresh", { refresh_token: "x" })
.catch((e) => e);
expect(err).toBeInstanceOf(SDKError);
expect(err.code).toBe("NETWORK_ERROR");
expect(err.httpStatus).toBe(0);
// Original platform message is preserved for diagnostics.
expect(err.message).toContain("Network request failed");
});
it("maps an AbortError (timeout) to SDKError TIMEOUT / status 0", async () => {
// Simulate the abort by throwing an Error whose name is "AbortError".
const fetchSpy = vi.fn(async () => {
const e = new Error("aborted");
e.name = "AbortError";
throw e;
});
const err = await client(fetchSpy as any)
.get("/v1/auth/challenge")
.catch((e) => e);
expect(err).toBeInstanceOf(SDKError);
expect(err.code).toBe("TIMEOUT");
expect(err.httpStatus).toBe(0);
// The message must include the configured timeout (5000 ms above).
expect(err.message).toContain("5000ms");
});
it("passes a real HTTP error through unchanged (not masked as NETWORK_ERROR)", async () => {
// A genuine 401 response with a JSON error body.
const fetchSpy = vi.fn(
async () =>
new Response(JSON.stringify({ error: "nope", code: "BAD_TOKEN" }), {
status: 401,
headers: { "content-type": "application/json" },
})
);
const err = await client(fetchSpy as any)
.post("/v1/auth/refresh", { refresh_token: "x" })
.catch((e) => e);
expect(err).toBeInstanceOf(SDKError);
expect(err.httpStatus).toBe(401);
expect(err.code).toBe("BAD_TOKEN");
});
it("hands the typed error (not the raw TypeError) to the onNetworkError callback", async () => {
// Collect whatever the callback receives.
const seen: SDKError[] = [];
const fetchSpy = vi.fn(async () => {
throw new TypeError("Failed to fetch");
});
// The rejection itself is irrelevant here; only the callback argument is checked.
await client(fetchSpy as any, (e: SDKError) => seen.push(e))
.get("/v1/db/read")
.catch(() => {});
expect(seen).toHaveLength(1);
expect(seen[0]).toBeInstanceOf(SDKError);
expect(seen[0].code).toBe("NETWORK_ERROR");
expect(seen[0].httpStatus).toBe(0);
});
});