feat(auth): refresh-token custom claims hook (#548)

Custom JWT claims survive token refresh: migration 031 adds the
custom-claims column to refresh tokens, the new gateway ClaimsProvider
re-resolves claims on refresh, and the serverless invoke path carries
them through. Includes refresh-rotation, WS-JWT middleware, and
claims-provider test coverage.
This commit is contained in:
anonpenguin23 2026-06-12 08:05:27 +03:00
parent 8472861ed3
commit d113b75497
18 changed files with 697 additions and 54 deletions

View File

@ -1 +1 @@
0.122.47
0.122.49

View File

@ -0,0 +1,15 @@
-- =============================================================================
-- 031_refresh_token_custom_claims.sql
--
-- Carry the additive JWT custom claims (e.g. the namespace's account_id from
-- the auth-claims-provider hook, bugboard #548/#920) ALONGSIDE the refresh
-- token, so a rotated access token keeps the same claims without re-invoking
-- the namespace's claims-provider function on every 15-min refresh (the
-- refresh path is the latency-critical VoIP-wake path, bugboard #125).
--
-- Resolved once at /v1/auth/verify mint time, stored here, and replayed +
-- propagated across each rotation. NULL/absent = no custom claims (the default
-- for every namespace without a claims-provider) → fully backward compatible.
-- =============================================================================
ALTER TABLE refresh_tokens ADD COLUMN custom_claims TEXT;

View File

@ -43,9 +43,10 @@ type AuthService interface {
// Verifies signature, expiration, and issuer.
ParseAndVerifyJWT(token string) (*JWTClaims, error)
// GenerateJWT creates a new signed JWT with the specified claims and TTL.
// GenerateJWT creates a new signed JWT with the specified subject, TTL, and
// optional additive custom claims (nil = none; bugboard #548).
// Returns: token, expirationUnix, error.
GenerateJWT(namespace, subject string, ttl time.Duration) (string, int64, error)
GenerateJWT(namespace, subject string, ttl time.Duration, custom map[string]string) (string, int64, error)
// RegisterApp registers a new client application with the gateway.
// Returns an application ID that can be used for OAuth flows.

View File

@ -4,6 +4,7 @@ import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
)
// sha256Hex returns the lowercase hex-encoded SHA-256 hash of the input string.
@ -22,3 +23,34 @@ func HmacSHA256Hex(data, secret string) string {
mac.Write([]byte(data))
return hex.EncodeToString(mac.Sum(nil))
}
// marshalClaims serializes additive JWT custom claims for storage alongside a
// refresh token (bugboard #548). Empty/nil → "" so the column stays NULL-ish
// and absent claims read back as nil.
func marshalClaims(m map[string]string) string {
if len(m) == 0 {
return ""
}
b, err := json.Marshal(m)
if err != nil {
return ""
}
return string(b)
}
// unmarshalClaims is the inverse of marshalClaims. An empty string or any
// malformed value yields nil (fail-soft — a corrupt claims blob must never
// break token rotation; the token simply rotates without custom claims).
func unmarshalClaims(s string) map[string]string {
if s == "" {
return nil
}
var m map[string]string
if err := json.Unmarshal([]byte(s), &m); err != nil {
return nil
}
if len(m) == 0 {
return nil
}
return m
}

View File

@ -182,15 +182,22 @@ func (s *Service) ParseAndVerifyJWT(token string) (*JWTClaims, error) {
return &claims, nil
}
func (s *Service) GenerateJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
// GenerateJWT mints a signed access token. `custom` carries additive
// app-defined claims (e.g. the namespace's account_id from the claims-provider
// hook, bugboard #548) under the top-level "custom" object — read back via
// JWTClaims.Custom / oh.GetCallerClaim. Pass nil for none. Reserved claims
// (sub/iss/aud/iat/nbf/exp/namespace) are always gateway-controlled and cannot
// be overridden by `custom` (the caller is responsible for not putting
// reserved keys here; the claims-provider path sanitizes them out upstream).
func (s *Service) GenerateJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
// Prefer EdDSA when available
if s.preferEdDSA && s.edSigningKey != nil {
return s.generateEdDSAJWT(ns, subject, ttl)
return s.generateEdDSAJWT(ns, subject, ttl, custom)
}
return s.generateRSAJWT(ns, subject, ttl)
return s.generateRSAJWT(ns, subject, ttl, custom)
}
func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
if s.edSigningKey == nil {
return "", 0, errors.New("EdDSA signing key unavailable")
}
@ -211,6 +218,9 @@ func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration) (strin
"exp": exp.Unix(),
"namespace": ns,
}
if len(custom) > 0 {
payload["custom"] = custom
}
pb, _ := json.Marshal(payload)
hb64 := base64.RawURLEncoding.EncodeToString(hb)
pb64 := base64.RawURLEncoding.EncodeToString(pb)
@ -220,7 +230,7 @@ func (s *Service) generateEdDSAJWT(ns, subject string, ttl time.Duration) (strin
return signingInput + "." + sb64, exp.Unix(), nil
}
func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration) (string, int64, error) {
func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration, custom map[string]string) (string, int64, error) {
if s.signingKey == nil {
return "", 0, errors.New("signing key unavailable")
}
@ -241,6 +251,9 @@ func (s *Service) generateRSAJWT(ns, subject string, ttl time.Duration) (string,
"exp": exp.Unix(),
"namespace": ns,
}
if len(custom) > 0 {
payload["custom"] = custom
}
pb, _ := json.Marshal(payload)
hb64 := base64.RawURLEncoding.EncodeToString(hb)
pb64 := base64.RawURLEncoding.EncodeToString(pb)

View File

@ -31,8 +31,15 @@ type rotationMockORMDB struct {
client.DatabaseClient
mu sync.Mutex
subjectByToken map[string]string // hashedToken -> subject (nil/missing = "invalid")
claimsByToken map[string]string // hashedToken -> custom_claims JSON (bugboard #548)
inserted int // count of INSERTs (new refresh-token rows)
subjects map[string]string // subject -> last hashed token inserted
// selectErrRemaining: number of upcoming "SELECT subject" calls that
// should return selectErr (simulates a transient rqlite leader outage).
// Decremented per matching call; 0 = serve normally (bugboard #125).
selectErr error
selectErrRemaining int
selectAttemptsTaken int
}
func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interface{}) (*client.QueryResult, error) {
@ -45,14 +52,23 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
if containsCI(sql, "SELECT id FROM namespaces") {
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{int64(1)}}}, nil
}
// SELECT subject for the refresh-token lookup.
if containsCI(sql, "SELECT subject FROM refresh_tokens") {
// SELECT subject (+ custom_claims, bugboard #548) for the lookup.
if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") {
m.selectAttemptsTaken++
if m.selectErrRemaining > 0 {
m.selectErrRemaining--
return nil, m.selectErr
}
if len(args) < 2 {
return &client.QueryResult{Count: 0}, nil
}
hashedTok, _ := args[1].(string)
if subj, ok := m.subjectByToken[hashedTok]; ok && subj != "" {
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj}}}, nil
claims := ""
if m.claimsByToken != nil {
claims = m.claimsByToken[hashedTok]
}
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj, claims}}}, nil
}
return &client.QueryResult{Count: 0}, nil
}
@ -71,6 +87,14 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
m.subjectByToken = map[string]string{}
}
m.subjectByToken[hashedTok] = subj
// custom_claims is the LAST arg (bugboard #548) — capture it so
// rotation-propagation tests can assert it carries forward.
if m.claimsByToken == nil {
m.claimsByToken = map[string]string{}
}
if cc, ok := args[len(args)-1].(string); ok {
m.claimsByToken[hashedTok] = cc
}
}
return &client.QueryResult{Count: 1}, nil
}
@ -369,3 +393,120 @@ func TestRefreshToken_RotatedTokenReplayFails(t *testing.T) {
t.Fatal("expected error reusing rotated token, got nil")
}
}
// Bugboard #125: a TRANSIENT rqlite error on the lookup (leader briefly
// unavailable during a rolling restart) must surface as ErrRefreshTransient
// (→ 503, retryable) — NOT "invalid or expired" (→ 401, full SIWE re-auth,
// impossible on a locked device answering a VoIP-woken call).
func TestRefreshToken_transientSelectError_returnsTransient(t *testing.T) {
s, ormDB, _ := newRotationTestService(t)
const refresh = "valid-but-leader-down"
ormDB.subjectByToken[sha256Hex(refresh)] = "0xWALLET"
// Every lookup attempt across the whole retry window errors.
ormDB.selectErr = errors.New("rqlite: leadership lost")
ormDB.selectErrRemaining = 99
_, _, _, _, err := s.RefreshToken(context.Background(), refresh, "anchat-test")
if !errors.Is(err, ErrRefreshTransient) {
t.Fatalf("err = %v, want ErrRefreshTransient (a valid token must not 401 during a leader outage)", err)
}
}
// The lookup is retried, so a brief blip recovers transparently within one
// refresh call (no client-visible failure at all).
func TestRefreshToken_selectRecoversAfterRetry(t *testing.T) {
s, ormDB, _ := newRotationTestService(t)
const refresh = "valid-blips-then-ok"
ormDB.subjectByToken[sha256Hex(refresh)] = "0xWALLET"
ormDB.selectErr = errors.New("rqlite: leadership lost")
ormDB.selectErrRemaining = refreshSelectRetries - 1 // fail all but the last attempt
access, newRefresh, subj, _, err := s.RefreshToken(context.Background(), refresh, "anchat-test")
if err != nil {
t.Fatalf("RefreshToken should recover after transient blips: %v", err)
}
if access == "" || newRefresh == "" || subj != "0xWALLET" {
t.Errorf("recovered refresh incomplete: access=%q newRefresh=%q subj=%q", access, newRefresh, subj)
}
}
// A transient error on the CAS write (revoke) is also retryable, not a 401.
func TestRefreshToken_transientUpdateError_returnsTransient(t *testing.T) {
s, ormDB, rq := newRotationTestService(t)
const refresh = "valid-cas-write-down"
ormDB.subjectByToken[sha256Hex(refresh)] = "0xWALLET"
rq.execErrNext = []error{errors.New("rqlite: write failed, no leader")}
_, _, _, _, err := s.RefreshToken(context.Background(), refresh, "anchat-test")
if !errors.Is(err, ErrRefreshTransient) {
t.Fatalf("err = %v, want ErrRefreshTransient on a transient CAS write error", err)
}
}
// A genuinely unknown token must remain a hard invalid (401), NOT be masked as
// transient — the distinction is the whole point of the #125 fix.
func TestRefreshToken_unknownToken_isNotTransient(t *testing.T) {
s, _, _ := newRotationTestService(t)
_, _, _, _, err := s.RefreshToken(context.Background(), "never-existed", "anchat-test")
if err == nil {
t.Fatal("expected error for unknown token")
}
if errors.Is(err, ErrRefreshTransient) {
t.Errorf("unknown token must be a genuine invalid (401), not transient (503): %v", err)
}
}
// mockClaimsResolver is a fixed claims-provider stand-in for the mint tests.
type mockClaimsResolver struct{ claims map[string]string }
func (m mockClaimsResolver) ResolveClaims(_ context.Context, _, _ string) map[string]string {
return m.claims
}
// Bugboard #548: claims resolved at IssueTokens (login) must be stored with the
// refresh token AND replayed into the rotated access token — so account_id
// survives the 15-min refresh without re-invoking the provider.
func TestRefreshToken_propagatesCustomClaims(t *testing.T) {
s, ormDB, _ := newRotationTestService(t)
s.SetClaimsResolver(mockClaimsResolver{claims: map[string]string{"account_id": "u-999"}})
// Login mint — IssueTokens resolves + stores the claims with the refresh row.
_, refresh, _, err := s.IssueTokens(context.Background(), "0xWALLET", "anchat-test")
if err != nil {
t.Fatalf("IssueTokens: %v", err)
}
if got := ormDB.claimsByToken[sha256Hex(refresh)]; got != `{"account_id":"u-999"}` {
t.Fatalf("claims not stored with refresh token; got %q", got)
}
// Refresh — the rotated access token must carry account_id, and the NEW
// refresh row must propagate the stored claims.
access, newRefresh, _, _, err := s.RefreshToken(context.Background(), refresh, "anchat-test")
if err != nil {
t.Fatalf("RefreshToken: %v", err)
}
claims, err := s.ParseAndVerifyJWT(access)
if err != nil {
t.Fatalf("ValidateJWT: %v", err)
}
if claims.Custom["account_id"] != "u-999" {
t.Errorf("rotated access token lost account_id; custom=%v", claims.Custom)
}
if got := ormDB.claimsByToken[sha256Hex(newRefresh)]; got != `{"account_id":"u-999"}` {
t.Errorf("rotation did not propagate claims to the new row; got %q", got)
}
// Second rotation hop (N+1 → N+2): the claim must survive repeated
// rotations, not just the first — the propagation is the whole point.
access2, _, _, _, err := s.RefreshToken(context.Background(), newRefresh, "anchat-test")
if err != nil {
t.Fatalf("second RefreshToken: %v", err)
}
claims2, err := s.ParseAndVerifyJWT(access2)
if err != nil {
t.Fatalf("ParseAndVerifyJWT (2nd): %v", err)
}
if claims2.Custom["account_id"] != "u-999" {
t.Errorf("account_id lost across the second rotation; custom=%v", claims2.Custom)
}
}

View File

@ -35,7 +35,8 @@ type Service struct {
edKeyID string
preferEdDSA bool
defaultNS string
apiKeyHMACSecret string // HMAC secret for hashing API keys before storage
apiKeyHMACSecret string // HMAC secret for hashing API keys before storage
claimsResolver ClaimsResolver // namespace claims-provider hook (bugboard #548); nil = none
}
func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signingKeyPEM string, defaultNS string) (*Service, error) {
@ -84,6 +85,28 @@ func (s *Service) SetRqliteClient(db rqlite.Client) {
s.db = db
}
// ClaimsResolver resolves additive, namespace-defined JWT custom claims for an
// authenticated wallet at token-mint time (bugboard #548/#920). The concrete
// implementation invokes the namespace's reserved `auth-claims-provider`
// serverless function; it MUST be fail-open (return nil, never error) so a
// missing/slow/broken provider never breaks authentication. Injected via
// SetClaimsResolver; nil = no custom claims (every namespace's default).
type ClaimsResolver interface {
ResolveClaims(ctx context.Context, wallet, namespace string) map[string]string
}
// SetClaimsResolver wires the namespace claims-provider hook used at mint time.
func (s *Service) SetClaimsResolver(r ClaimsResolver) { s.claimsResolver = r }
// resolveCustomClaims returns the namespace's additive claims for this wallet,
// or nil. Fail-open by contract — the resolver never errors.
func (s *Service) resolveCustomClaims(ctx context.Context, wallet, namespace string) map[string]string {
if s.claimsResolver == nil {
return nil
}
return s.claimsResolver.ResolveClaims(ctx, wallet, namespace)
}
// ErrRotationNotConfigured is returned by RefreshToken when the service
// wasn't given an rqlite client — refusing to rotate without atomicity
// guarantees is safer than rotating non-atomically.
@ -224,8 +247,13 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
return "", "", 0, fmt.Errorf("signing key unavailable")
}
// Resolve namespace-defined additive claims (bugboard #548) ONCE at mint
// time. Stored with the refresh token below and replayed across rotations
// so the 15-min refresh path never re-invokes the provider.
custom := s.resolveCustomClaims(ctx, wallet, namespace)
// Issue access token (15m)
token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute)
token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute, custom)
if err != nil {
return "", "", 0, fmt.Errorf("failed to generate JWT: %w", err)
}
@ -246,8 +274,8 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
db := s.orm.Database()
hashedRefresh := sha256Hex(refresh)
if _, err := db.Query(internalCtx,
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))",
nsID, wallet, hashedRefresh, "gateway",
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
nsID, wallet, hashedRefresh, "gateway", marshalClaims(custom),
); err != nil {
return "", "", 0, fmt.Errorf("failed to store refresh token: %w", err)
}
@ -265,6 +293,27 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st
// This is the tripwire promised by RFC 9700 §4.12 (refresh-token rotation).
var ErrRefreshTokenReplay = fmt.Errorf("refresh token already rotated or invalid")
// ErrRefreshTransient is returned when refresh-token rotation fails for a
// RETRYABLE reason — an rqlite-layer error rather than a genuine bad/expired
// token. Bugboard #125: during a rolling gateway restart the rqlite leader is
// briefly unavailable (re-election window), so the lookup/rotation errors;
// collapsing that into "invalid token" forces a 401 → full SIWE re-auth, which
// is impossible on a locked device answering a VoIP-woken call. Callers MUST
// surface this as a retryable 503, NOT a 401, so the client retries within the
// ring window instead of tearing down the session.
var ErrRefreshTransient = fmt.Errorf("refresh token rotation temporarily unavailable")
const (
// refreshSelectRetries bounds how many times the refresh lookup is retried
// when the rqlite read errors (transient leader unavailability). The read
// is idempotent and happens BEFORE any write, so retrying is safe.
refreshSelectRetries = 3
// refreshSelectRetryDelay is the backoff between lookup retries. Three
// tries × 250ms rides out a brief leader re-election without adding
// meaningful latency to the common (healthy-leader) path.
refreshSelectRetryDelay = 250 * time.Millisecond
)
// RefreshToken validates the supplied refresh token, atomically rotates it
// (revokes the old, mints a new), and returns a fresh access token alongside
// the rotated refresh token.
@ -309,22 +358,61 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
nsID, err := s.ResolveNamespaceID(ctx, namespace)
if err != nil {
return "", "", "", 0, err
// Bugboard #125: namespace resolution runs an rqlite query BEFORE the
// token lookup, so a leader re-election during a rolling restart fails
// here too. Treat it as retryable (→ 503), not a bad token (→ 401) —
// the refresh-path namespace comes from an already-authenticated
// session, so a resolution failure is a transient DB error, never the
// client's fault.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh namespace resolution failed (transient, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
hashedRefresh := sha256Hex(refreshToken)
// Step 1: read the subject. Tells us who the token belongs to AND
// validates that it's currently usable (not revoked, not expired).
selectQ := `SELECT subject FROM refresh_tokens
//
// Bugboard #125: distinguish a TRANSIENT rqlite error (leader briefly
// unavailable during a rolling restart) from a GENUINE token miss. The
// read is idempotent and pre-write, so we retry it a few times; only after
// exhausting retries do we surface ErrRefreshTransient (→ 503, client
// retries). An actual empty result (Count == 0) is a real bad/expired
// token → "invalid or expired" (→ 401). Collapsing the two used to 401 a
// valid session during every restart, defeating the VoIP-wake refresh.
selectQ := `SELECT subject, custom_claims FROM refresh_tokens
WHERE namespace_id = ? AND token = ?
AND revoked_at IS NULL
AND (expires_at IS NULL OR expires_at > datetime('now'))
LIMIT 1`
res, err := ormDB.Query(internalCtx, selectQ, nsID, hashedRefresh)
if err != nil || res == nil || res.Count == 0 {
var res *client.QueryResult
var selErr error
for attempt := 0; attempt < refreshSelectRetries; attempt++ {
res, selErr = ormDB.Query(internalCtx, selectQ, nsID, hashedRefresh)
if selErr == nil && res != nil {
break
}
if attempt < refreshSelectRetries-1 {
time.Sleep(refreshSelectRetryDelay)
}
}
if selErr != nil || res == nil {
// rqlite error persisted across retries — leader likely mid-election.
// Retryable, NOT an invalid token.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token lookup failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(selErr))
return "", "", "", 0, ErrRefreshTransient
}
if res.Count == 0 {
// Genuinely not found / revoked / expired — a real bad token.
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
}
var customClaimsJSON string
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if val, ok := res.Rows[0][0].(string); ok {
subject = val
@ -332,7 +420,15 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
}
// custom_claims (bugboard #548) — resolved once at login, replayed on
// every rotation so the refresh path never re-invokes the provider.
if len(res.Rows[0]) > 1 {
if cc, ok := res.Rows[0][1].(string); ok {
customClaimsJSON = cc
}
}
}
custom := unmarshalClaims(customClaimsJSON)
// Step 2: atomic CAS — revoke the old row. RowsAffected is the lock.
// Two concurrent calls with the same refresh token: exactly one wins
@ -343,7 +439,13 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
nsID, hashedRefresh)
if err != nil {
return "", "", "", 0, fmt.Errorf("revoke old refresh token: %w", err)
// rqlite write error (leader unavailable) — retryable, not a bad
// token. No row was revoked, so a client retry is safe (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token revoke failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
affected, _ := updRes.RowsAffected()
if affected == 0 {
@ -359,8 +461,9 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
return "", "", "", 0, ErrRefreshTokenReplay
}
// Step 3: mint the new access JWT.
accessToken, expUnix, err = s.GenerateJWT(namespace, subject, 15*time.Minute)
// Step 3: mint the new access JWT, carrying forward the stored custom
// claims so a rotated token keeps the same account_id etc. (bugboard #548).
accessToken, expUnix, err = s.GenerateJWT(namespace, subject, 15*time.Minute, custom)
if err != nil {
return "", "", "", 0, fmt.Errorf("generate access token: %w", err)
}
@ -376,10 +479,23 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
}
newRefreshToken = base64.RawURLEncoding.EncodeToString(rbuf)
hashedNew := sha256Hex(newRefreshToken)
// Re-marshal from the parsed map (not the raw stored string) so the new
// row and the freshly-minted access token are provably consistent and
// self-healing — a malformed stored blob converges to "" on both sides
// rather than being propagated forward verbatim. custom_claims is written
// ONLY here and in IssueTokens, both from a sanitized map (bugboard #548).
if _, err := ormDB.Query(internalCtx,
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))",
nsID, subject, hashedNew, "gateway"); err != nil {
return "", "", "", 0, fmt.Errorf("store rotated refresh token: %w", err)
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
nsID, subject, hashedNew, "gateway", marshalClaims(custom)); err != nil {
// The old token is already revoked (step 2). A retryable error here
// leaves the client to re-attempt — which will re-auth since the old
// token is gone — but that's strictly better than masking a transient
// failure as a permanent 401 (bugboard #125). Surface retryable.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token store failed after revoke (transient rqlite error)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
return accessToken, newRefreshToken, subject, expUnix, nil

View File

@ -112,7 +112,7 @@ func TestJWTFlow(t *testing.T) {
sub := "0x1234567890abcdef1234567890abcdef12345678"
ttl := 15 * time.Minute
token, exp, err := s.GenerateJWT(ns, sub, ttl)
token, exp, err := s.GenerateJWT(ns, sub, ttl, nil)
if err != nil {
t.Fatalf("GenerateJWT failed: %v", err)
}
@ -192,7 +192,7 @@ func TestEdDSAJWTFlow(t *testing.T) {
ttl := 15 * time.Minute
// With EdDSA preferred, GenerateJWT should produce an EdDSA token
token, exp, err := s.GenerateJWT(ns, sub, ttl)
token, exp, err := s.GenerateJWT(ns, sub, ttl, nil)
if err != nil {
t.Fatalf("GenerateJWT (EdDSA) failed: %v", err)
}
@ -233,7 +233,7 @@ func TestRS256BackwardCompat(t *testing.T) {
// Generate an RS256 token directly (simulating a legacy token)
s.preferEdDSA = false
token, _, err := s.GenerateJWT("test-ns", "user1", 15*time.Minute)
token, _, err := s.GenerateJWT("test-ns", "user1", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT (RS256) failed: %v", err)
}
@ -447,7 +447,7 @@ func TestEdDSACrossServiceVerify(t *testing.T) {
const wantSub = "BNbN2RNQTsYrrywZCLnhV9j3hd38jwcRqfxBecZX7hDE"
const wantNS = "anchat-test"
token, _, err := signer.GenerateJWT(wantNS, wantSub, 15*time.Minute)
token, _, err := signer.GenerateJWT(wantNS, wantSub, 15*time.Minute, nil)
if err != nil {
t.Fatalf("signer.GenerateJWT: %v", err)
}
@ -478,7 +478,7 @@ func TestEdDSACrossServiceVerify_differentKeysFail(t *testing.T) {
_, verKey, _ := ed25519.GenerateKey(rand.Reader)
verifier.SetEdDSAKey(verKey)
token, _, err := signer.GenerateJWT("ns", "sub", 15*time.Minute)
token, _, err := signer.GenerateJWT("ns", "sub", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}

View File

@ -0,0 +1,178 @@
package gateway
import (
"context"
"encoding/json"
"errors"
"sort"
"sync"
"time"
"github.com/DeBrosOfficial/network/pkg/serverless"
"github.com/DeBrosOfficial/network/pkg/serverless/registry"
"go.uber.org/zap"
)
// Claims-provider hook (bugboard #548/#920).
//
// A namespace opts into additive, signed JWT claims by deploying a serverless
// function with the RESERVED name "auth-claims-provider". At /v1/auth/verify
// mint time the gateway invokes it (in the namespace's own context, so it can
// read the namespace's tables) with {"wallet","namespace"} and merges the
// string→string object it returns into the JWT's custom claims — e.g.
// {"account_id":"<users.user_id>"} so push devices key on the stable account
// identity rather than the authenticating wallet.
//
// Hard guarantees:
// - FAIL-OPEN: a missing / slow / erroring / malformed provider yields NO
// claims; authentication never breaks because a claims function is down.
// - Reserved claims (sub/iss/aud/iat/nbf/exp/namespace/custom) can never be
// set by the provider — the gateway controls those.
// - Bounded: timeout, max claim count, max total size.
const (
// claimsProviderFnName is the reserved function name a namespace deploys to
// inject additive JWT claims at mint time.
claimsProviderFnName = "auth-claims-provider"
// claimsProviderTimeout bounds the provider invocation so a slow/hung
// function never stalls the auth path past this budget (fail-open after).
claimsProviderTimeout = 2 * time.Second
// maxCustomClaims / maxCustomClaimsBytes cap what a provider may inject —
// JWTs ride in headers, and an unbounded claim blob is a DoS / cost vector.
maxCustomClaims = 16
maxCustomClaimsBytes = 4096
// claimsProviderWarnInterval rate-limits the fail-open WARN so a broken
// provider doesn't flood the log on every login.
claimsProviderWarnInterval = 30 * time.Second
)
// reservedClaimKeys can never be injected by a namespace claims provider; the
// gateway owns these. A provider that returns any of them has them dropped.
var reservedClaimKeys = map[string]struct{}{
"sub": {}, "iss": {}, "aud": {}, "iat": {},
"nbf": {}, "exp": {}, "namespace": {}, "custom": {},
}
// jwtClaimsProvider implements auth.ClaimsResolver by invoking the namespace's
// reserved auth-claims-provider function.
type jwtClaimsProvider struct {
invoker *serverless.Invoker
logger *zap.Logger
mu sync.Mutex
lastWarnUTC time.Time
}
// newJWTClaimsProvider builds the resolver. A nil invoker disables the hook
// (ResolveClaims returns nil).
func newJWTClaimsProvider(invoker *serverless.Invoker, logger *zap.Logger) *jwtClaimsProvider {
if logger == nil {
logger = zap.NewNop()
}
return &jwtClaimsProvider{invoker: invoker, logger: logger.Named("claims-provider")}
}
// ResolveClaims invokes the namespace's auth-claims-provider and returns the
// sanitized additive claims, or nil. Never errors (fail-open contract).
func (p *jwtClaimsProvider) ResolveClaims(ctx context.Context, wallet, namespace string) map[string]string {
if p.invoker == nil || wallet == "" || namespace == "" {
return nil
}
input, err := json.Marshal(map[string]string{"wallet": wallet, "namespace": namespace})
if err != nil {
return nil
}
callCtx, cancel := context.WithTimeout(ctx, claimsProviderTimeout)
defer cancel()
resp, err := p.invoker.Invoke(callCtx, &serverless.InvokeRequest{
Namespace: namespace,
FunctionName: claimsProviderFnName,
Input: input,
// Gateway-initiated, no end-user caller → system trigger skips the
// per-caller authorization check.
TriggerType: serverless.TriggerTypeInternal,
})
if err != nil || resp == nil {
// The namespace simply hasn't deployed the function (registry miss) is
// the normal no-claims case for most namespaces — stay silent. Any
// other failure is a real problem worth a rate-limited WARN.
if !errors.Is(err, registry.ErrFunctionNotFound) {
p.warnRateLimited("claims provider invoke failed (minting without custom claims)",
namespace, err)
}
return nil
}
if resp.Status != serverless.InvocationStatusSuccess {
p.warnRateLimited("claims provider returned non-success (minting without custom claims)",
namespace, nil)
return nil
}
return sanitizeProviderClaims(resp.Output)
}
// sanitizeProviderClaims parses the provider's RAW stdout as a bare JSON object
// of additive claims (NOT an {ok,result} Ack envelope — per the #976 contract)
// and returns a safe string→string subset: string values only, reserved keys
// dropped, bounded count and total size. Any parse failure → nil (fail-open).
func sanitizeProviderClaims(raw []byte) map[string]string {
if len(raw) == 0 || len(raw) > maxCustomClaimsBytes {
return nil
}
var obj map[string]any
if err := json.Unmarshal(raw, &obj); err != nil || len(obj) == 0 {
return nil
}
// Iterate in sorted key order so an over-budget provider payload truncates
// DETERMINISTICALLY (Go map iteration is randomized) — the same output must
// always yield the same claims, never a per-login-varying subset.
keys := make([]string, 0, len(obj))
for k := range obj {
keys = append(keys, k)
}
sort.Strings(keys)
out := make(map[string]string, len(obj))
total := 0
for _, k := range keys {
if len(out) >= maxCustomClaims {
break
}
if _, reserved := reservedClaimKeys[k]; reserved {
continue
}
s, ok := obj[k].(string) // string→string contract; non-string values dropped
if !ok {
continue
}
total += len(k) + len(s)
if total > maxCustomClaimsBytes {
break
}
out[k] = s
}
if len(out) == 0 {
return nil
}
return out
}
func (p *jwtClaimsProvider) warnRateLimited(msg, namespace string, err error) {
p.mu.Lock()
now := time.Now()
if now.Sub(p.lastWarnUTC) < claimsProviderWarnInterval {
p.mu.Unlock()
return
}
p.lastWarnUTC = now
p.mu.Unlock()
fields := []zap.Field{zap.String("namespace", namespace), zap.String("function", claimsProviderFnName)}
if err != nil {
fields = append(fields, zap.Error(err))
}
p.logger.Warn(msg, fields...)
}

View File

@ -0,0 +1,98 @@
package gateway
import (
"testing"
)
// Bugboard #548: the claims-provider sanitizer is the security boundary —
// a namespace function must NOT be able to forge reserved claims, inject
// non-string values, or blow the size budget.
func TestSanitizeProviderClaims_happyPath(t *testing.T) {
out := sanitizeProviderClaims([]byte(`{"account_id":"u-123","tier":"pro"}`))
if out["account_id"] != "u-123" || out["tier"] != "pro" {
t.Fatalf("expected additive claims, got %v", out)
}
}
func TestSanitizeProviderClaims_dropsReservedKeys(t *testing.T) {
// A malicious provider tries to override sub/exp/namespace — must be dropped.
out := sanitizeProviderClaims([]byte(`{"sub":"0xATTACKER","exp":"9999999999","namespace":"evil","account_id":"u-1"}`))
for _, k := range []string{"sub", "exp", "namespace"} {
if _, present := out[k]; present {
t.Errorf("reserved key %q must be dropped, got %v", k, out)
}
}
if out["account_id"] != "u-1" {
t.Errorf("legitimate claim dropped: %v", out)
}
}
func TestSanitizeProviderClaims_nonStringValuesDropped(t *testing.T) {
out := sanitizeProviderClaims([]byte(`{"account_id":"u-1","num":5,"obj":{"a":1},"arr":[1],"ok":"yes"}`))
if len(out) != 2 || out["account_id"] != "u-1" || out["ok"] != "yes" {
t.Errorf("non-string values must be dropped; got %v", out)
}
}
func TestSanitizeProviderClaims_failOpenOnGarbage(t *testing.T) {
for _, bad := range [][]byte{
nil,
[]byte(``),
[]byte(`not json`),
[]byte(`[1,2,3]`), // array, not object
[]byte(`"just a string"`), // scalar
[]byte(`{}`), // empty object
[]byte(`{"ok":true,"result":{"account_id":"u"}}`), // Ack envelope (wrong shape) → no top-level string claims
} {
if got := sanitizeProviderClaims(bad); got != nil {
t.Errorf("garbage %q must yield nil (fail-open), got %v", bad, got)
}
}
}
func TestSanitizeProviderClaims_countAndSizeCapped(t *testing.T) {
// Way more than maxCustomClaims string entries.
buf := []byte("{")
for i := 0; i < maxCustomClaims+20; i++ {
if i > 0 {
buf = append(buf, ',')
}
buf = append(buf, []byte(`"k`)...)
buf = append(buf, []byte(itoa(i))...)
buf = append(buf, []byte(`":"v"`)...)
}
buf = append(buf, '}')
out := sanitizeProviderClaims(buf)
if len(out) > maxCustomClaims {
t.Errorf("claim count not capped: got %d, max %d", len(out), maxCustomClaims)
}
// Oversized total payload → rejected outright.
big := make([]byte, maxCustomClaimsBytes+10)
for i := range big {
big[i] = 'a'
}
if got := sanitizeProviderClaims(big); got != nil {
t.Errorf("oversized payload must be rejected, got %v", got)
}
}
func TestResolveClaims_nilInvokerOrEmptyArgs(t *testing.T) {
p := newJWTClaimsProvider(nil, nil) // nil invoker disables the hook
if got := p.ResolveClaims(nil, "0xW", "ns"); got != nil {
t.Errorf("nil invoker must yield nil claims, got %v", got)
}
}
func itoa(n int) string {
if n == 0 {
return "0"
}
var b []byte
for n > 0 {
b = append([]byte{byte('0' + n%10)}, b...)
n /= 10
}
return string(b)
}

View File

@ -648,6 +648,14 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
authService.SetRqliteClient(deps.ORMClient)
}
// Wire the namespace claims-provider hook (bugboard #548): at JWT mint time
// the auth service invokes the namespace's reserved `auth-claims-provider`
// function (if deployed) and merges its additive claims (e.g. account_id)
// into the token. Fail-open — a missing/slow provider never breaks auth.
if deps.ServerlessInvoker != nil {
authService.SetClaimsResolver(newJWTClaimsProvider(deps.ServerlessInvoker, logger.Logger))
}
// Load or create EdDSA key for new JWT tokens. Bug #215 fix: when
// cfg.ClusterSecret is set, the key is derived deterministically from
// it via HKDF, so every gateway in the cluster shares the same Ed25519

View File

@ -393,6 +393,32 @@ func TestRefreshHandler_NilAuthService(t *testing.T) {
}
}
// Bugboard #125: a non-bad-token failure (here ErrRotationNotConfigured from a
// service with no rqlite client) must surface as a RETRYABLE 503 with a
// Retry-After header — NOT a 401 that would force a locked device into an
// impossible SIWE re-auth mid-call-ring.
func TestRefreshHandler_TransientError_returns503Retryable(t *testing.T) {
svc, err := authsvc.NewService(testLogger(), nil, "", "default")
if err != nil {
t.Fatalf("failed to create auth service: %v", err)
}
h := NewHandlers(testLogger(), svc, nil, "default", noopInternalAuth)
body, _ := json.Marshal(RefreshRequest{RefreshToken: "some-valid-looking-token"})
req := httptest.NewRequest(http.MethodPost, "/v1/auth/refresh", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
h.RefreshHandler(rec, req)
if rec.Code != http.StatusServiceUnavailable {
t.Fatalf("transient refresh failure must be 503, got %d", rec.Code)
}
if rec.Header().Get("Retry-After") == "" {
t.Error("503 refresh response should carry a Retry-After header")
}
}
// --- APIKeyToJWTHandler tests ---------------------------------------------
func TestAPIKeyToJWTHandler_MissingKey(t *testing.T) {

View File

@ -2,6 +2,7 @@ package auth
import (
"encoding/json"
"errors"
"net/http"
"strings"
"time"
@ -57,7 +58,7 @@ func (h *Handlers) APIKeyToJWTHandler(w http.ResponseWriter, r *http.Request) {
return
}
token, expUnix, err := h.authService.GenerateJWT(ns, key, 15*time.Minute)
token, expUnix, err := h.authService.GenerateJWT(ns, key, 15*time.Minute, nil)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
@ -103,11 +104,20 @@ func (h *Handlers) RefreshHandler(w http.ResponseWriter, r *http.Request) {
// the SDK persists it (bug #239 fix) and uses it on the next refresh.
token, newRefreshToken, subject, expUnix, err := h.authService.RefreshToken(r.Context(), req.RefreshToken, req.Namespace)
if err != nil {
// The service emits a WARN log on replay (ErrRefreshTokenReplay)
// so the operator can investigate. We surface a generic 401 here
// regardless — leaking "your token was already used" to the
// caller would help an attacker confirm a stolen token has been
// rotated.
// Bugboard #125: a TRANSIENT rotation failure (rqlite leader briefly
// unavailable during a rolling restart) must surface as a retryable
// 503 — NOT a 401 — so the client retries within the call-ring window
// instead of tearing the session down to a full SIWE re-auth, which is
// impossible on a locked device answering a VoIP-woken call.
if errors.Is(err, authsvc.ErrRefreshTransient) || errors.Is(err, authsvc.ErrRotationNotConfigured) {
w.Header().Set("Retry-After", "1")
writeError(w, http.StatusServiceUnavailable, "refresh temporarily unavailable, retry")
return
}
// Genuine bad/expired/replayed token. The service emits a WARN log on
// replay (ErrRefreshTokenReplay) so the operator can investigate. We
// surface a generic 401 regardless — leaking "your token was already
// used" would help an attacker confirm a stolen token was rotated.
writeError(w, http.StatusUnauthorized, "invalid or expired refresh token")
return
}

View File

@ -23,7 +23,7 @@ func TestJWTGenerateAndParse(t *testing.T) {
t.Fatalf("failed to create service: %v", err)
}
tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute)
tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute, nil)
if err != nil || exp <= 0 {
t.Fatalf("gen err=%v exp=%d", err, exp)
}
@ -50,7 +50,7 @@ func TestJWTExpired(t *testing.T) {
}
// Use sufficiently negative TTL to bypass allowed clock skew
tok, _, err := svc.GenerateJWT("ns1", "subj", -2*time.Minute)
tok, _, err := svc.GenerateJWT("ns1", "subj", -2*time.Minute, nil)
if err != nil {
t.Fatalf("gen err=%v", err)
}

View File

@ -51,7 +51,7 @@ func newAuthServiceForTest(t *testing.T) *auth.Service {
func TestAuthMiddleware_WSJWTQuery_validToken(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET_SUBJECT", 15*time.Minute)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET_SUBJECT", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -125,7 +125,7 @@ func TestAuthMiddleware_WSJWTQuery_ignoredOnNonWSRequest(t *testing.T) {
// privacy issues of JWTs leaking via referrer headers, browser history,
// and access logs.
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("ns", "sub", 15*time.Minute)
token, _, err := svc.GenerateJWT("ns", "sub", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -156,8 +156,8 @@ func TestAuthMiddleware_WSJWTQuery_headerWinsOverQuery(t *testing.T) {
// Header path runs FIRST and wins. Verifies the query fallback is a
// fallback, not an override.
svc := newAuthServiceForTest(t)
headerJWT, _, _ := svc.GenerateJWT("ns-header", "sub-header", 15*time.Minute)
queryJWT, _, _ := svc.GenerateJWT("ns-query", "sub-query", 15*time.Minute)
headerJWT, _, _ := svc.GenerateJWT("ns-header", "sub-header", 15*time.Minute, nil)
queryJWT, _, _ := svc.GenerateJWT("ns-query", "sub-query", 15*time.Minute, nil)
g := &Gateway{authService: svc}
@ -242,7 +242,7 @@ func TestAuthMiddleware_WSJWTQuery_malformedJWTFallsThrough(t *testing.T) {
func TestValidateAuthForNamespaceProxy_WSJWTQuery(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -270,7 +270,7 @@ func TestValidateAuthForNamespaceProxy_WSJWTQuery(t *testing.T) {
func TestValidateAuthForNamespaceProxy_WSJWTQuery_ignoredOnNonWS(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}
@ -295,7 +295,7 @@ func TestValidateAuthForNamespaceProxy_WSJWTQuery_ignoredOnNonWS(t *testing.T) {
// doesn't leak into proxy hops or downstream logs.
func TestAuthMiddleware_WSJWTQuery_strippedAfterVerify(t *testing.T) {
svc := newAuthServiceForTest(t)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute)
token, _, err := svc.GenerateJWT("anchat-test", "0xWALLET", 15*time.Minute, nil)
if err != nil {
t.Fatalf("GenerateJWT: %v", err)
}

View File

@ -493,7 +493,7 @@ func (i *Invoker) BatchInvoke(ctx context.Context, req *BatchInvokeRequest) (*Ba
func isSystemTrigger(t TriggerType) bool {
switch t {
case TriggerTypeCron, TriggerTypePubSub, TriggerTypeDatabase,
TriggerTypeTimer, TriggerTypeJob:
TriggerTypeTimer, TriggerTypeJob, TriggerTypeInternal:
return true
}
return false

View File

@ -30,6 +30,11 @@ const (
TriggerTypePubSub TriggerType = "pubsub"
TriggerTypeTimer TriggerType = "timer"
TriggerTypeJob TriggerType = "job"
// TriggerTypeInternal marks a gateway-initiated invocation with no end-user
// caller (e.g. the auth claims-provider hook at JWT mint time, bugboard
// #548). Treated as a system trigger so the per-caller authorization check
// is skipped — the gateway is the trusted invoker.
TriggerTypeInternal TriggerType = "internal"
)
// JobStatus represents the current state of a background job.
@ -234,8 +239,8 @@ type FunctionDefinition struct {
// When WSPersistent is true, the function exports ws_open/ws_frame/ws_close
// instead of using the default per-frame stateless model.
WSPersistent bool `json:"ws_persistent,omitempty"`
WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` // 0 = no idle timeout
WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` // 0 = use default 256 KB
WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` // 0 = no idle timeout
WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` // 0 = use default 256 KB
WSMaxInflightPerConn int `json:"ws_max_inflight_per_conn,omitempty"` // 0 = use default 64
// RawHTTPResponse enables raw-HTTP-response mode (bugboard #835): the
@ -284,11 +289,11 @@ type Function struct {
// InvocationContext provides context for a function invocation.
type InvocationContext struct {
RequestID string `json:"request_id"`
FunctionID string `json:"function_id"`
FunctionName string `json:"function_name"`
Namespace string `json:"namespace"`
CallerWallet string `json:"caller_wallet,omitempty"`
RequestID string `json:"request_id"`
FunctionID string `json:"function_id"`
FunctionName string `json:"function_name"`
Namespace string `json:"namespace"`
CallerWallet string `json:"caller_wallet,omitempty"`
// CallerIP is the source IP of the request, populated by HTTP/WS handlers.
// Used by the multi-tier rate limiter as a fallback bucket for anonymous
// (no-wallet) callers.

View File

@ -1,6 +1,6 @@
{
"name": "@debros/orama",
"version": "0.122.47",
"version": "0.122.49",
"description": "TypeScript SDK for Orama Network - Database, PubSub, Cache, Storage, Vault, and more",
"type": "module",
"main": "./dist/index.js",