diff --git a/core/migrations/032_refresh_token_reuse_grace.sql b/core/migrations/032_refresh_token_reuse_grace.sql new file mode 100644 index 0000000..744b5ad --- /dev/null +++ b/core/migrations/032_refresh_token_reuse_grace.sql @@ -0,0 +1,20 @@ +-- 032_refresh_token_reuse_grace.sql +-- +-- Bugboard #125: bounded, single-use reuse grace for rotated refresh tokens. +-- +-- Refresh-token rotation is single-use: a successful /v1/auth/refresh revokes +-- the presented token and issues a new one. If the rotation RESPONSE is lost +-- in transit (e.g. a reconnect storm during a gateway roll), the client is +-- left holding a just-revoked token and its retry dead-ends in a 401 -> SIWE. +-- On a VoIP-woken locked screen SIWE is impossible, so the call dies. +-- +-- grace_used_at lets the gateway accept a just-rotated token ONE more time +-- within a short window (RFC 9700 §4.13.2 reuse grace) and mint a fresh +-- session, while the single-use CAS on this column prevents a stolen token +-- from being replayed repeatedly. NULL = grace not yet consumed. +-- +-- Additive ALTER (rolling-upgrade safe): older gateways ignore the column; +-- newer ones read it back NULL for pre-existing rows, which is the correct +-- "grace available" default. + +ALTER TABLE refresh_tokens ADD COLUMN grace_used_at TIMESTAMP; diff --git a/core/pkg/gateway/auth/refresh_rotation_test.go b/core/pkg/gateway/auth/refresh_rotation_test.go index cd2beb8..a25a121 100644 --- a/core/pkg/gateway/auth/refresh_rotation_test.go +++ b/core/pkg/gateway/auth/refresh_rotation_test.go @@ -32,6 +32,10 @@ type rotationMockORMDB struct { mu sync.Mutex subjectByToken map[string]string // hashedToken -> subject (nil/missing = "invalid") claimsByToken map[string]string // hashedToken -> custom_claims JSON (bugboard #548) + // graceableTokens: hashedToken -> subject for tokens that are revoked but + // still inside the reuse-grace window (bugboard #125). The grace SELECT + // (detected by the grace_used_at predicate) reads from here. + graceableTokens map[string]string inserted int // count of INSERTs (new refresh-token rows) subjects map[string]string // subject -> last hashed token inserted // selectErrRemaining: number of upcoming "SELECT subject" calls that @@ -52,6 +56,23 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa if containsCI(sql, "SELECT id FROM namespaces") { return &client.QueryResult{Count: 1, Rows: [][]interface{}{{int64(1)}}}, nil } + // Grace-path SELECT (bugboard #125): SELECT subject for a recently-revoked, + // grace-available token. Distinguished from the active-path SELECT by the + // grace_used_at predicate. Must be checked BEFORE the generic handler. + if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") && containsCI(sql, "grace_used_at") { + if len(args) < 2 { + return &client.QueryResult{Count: 0}, nil + } + hashedTok, _ := args[1].(string) + if subj, ok := m.graceableTokens[hashedTok]; ok && subj != "" { + claims := "" + if m.claimsByToken != nil { + claims = m.claimsByToken[hashedTok] + } + return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj, claims}}}, nil + } + return &client.QueryResult{Count: 0}, nil + } // SELECT subject (+ custom_claims, bugboard #548) for the lookup. if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") { m.selectAttemptsTaken++ @@ -72,6 +93,21 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa } return &client.QueryResult{Count: 0}, nil } + // RevokeToken UPDATE that ALSO burns the grace slot (bugboard #125 + // logout-bypass fix). Reflect it by clearing the token's grace eligibility + // so a follow-on grace SELECT misses it. (The rotation grace CAS goes + // through the rqlite Exec mock, not here, so there's no collision.) + if containsCI(sql, "UPDATE refresh_tokens") && containsCI(sql, "grace_used_at") && len(args) >= 2 { + if key, ok := args[1].(string); ok && m.graceableTokens != nil { + delete(m.graceableTokens, key) // single-token: key is the hashed token + for tok, subj := range m.graceableTokens { + if subj == key { // revoke-all: key is the subject + delete(m.graceableTokens, tok) + } + } + } + return &client.QueryResult{Count: 1}, nil + } // INSERT new refresh_tokens row. if containsCI(sql, "INSERT INTO refresh_tokens") { m.inserted++ @@ -113,6 +149,12 @@ type rotationMockRqlite struct { rowsAffectedNext []int64 // programmable per-call values; pop from front. Defaults to "revoke if unrevoked". execErrNext []error // programmable per-call errors parallelExecGuard sync.Mutex + // graceCASNext: programmable RowsAffected for the grace CAS (UPDATE ... SET + // grace_used_at). 1 = won the single-use grace; 0 = already consumed + // (bugboard #125). Defaults to "win once per token". + graceCASNext []int64 + graceConsumed map[string]bool + graceCASCalls int } func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interface{}) (sql.Result, error) { @@ -133,6 +175,29 @@ func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interfa } } + // Grace CAS (bugboard #125): UPDATE ... SET grace_used_at, single-use. + if containsCI(sql, "SET grace_used_at") && len(args) >= 2 { + m.graceCASCalls++ + hashedTok, _ := args[1].(string) + if m.graceConsumed == nil { + m.graceConsumed = map[string]bool{} + } + var affected int64 + if len(m.graceCASNext) > 0 { + affected = m.graceCASNext[0] + m.graceCASNext = m.graceCASNext[1:] + if affected == 1 { + m.graceConsumed[hashedTok] = true + } + } else if !m.graceConsumed[hashedTok] { + m.graceConsumed[hashedTok] = true + affected = 1 + } else { + affected = 0 + } + return &rotationFakeResult{affected: affected}, nil + } + // Default UPDATE behavior: matches if token is currently unrevoked. if containsCI(sql, "UPDATE refresh_tokens SET revoked_at") && len(args) >= 2 { hashedTok, _ := args[1].(string) @@ -510,3 +575,116 @@ func TestRefreshToken_propagatesCustomClaims(t *testing.T) { t.Errorf("account_id lost across the second rotation; custom=%v", claims2.Custom) } } + +// ---------------------------------------------------------------------------- +// Bugboard #125 — bounded, single-use refresh-token reuse grace (RFC 9700 +// §4.13.2). A rotation response lost in transit must NOT dead-end in a 401. +// ---------------------------------------------------------------------------- + +// A just-rotated token (revoked, within grace, grace not consumed) is accepted +// ONCE more and mints a fresh session — recovering a client whose rotation +// response was lost. The revoke CAS is skipped (the token is already revoked), +// so this must NOT surface the replay tripwire. +func TestRefreshToken_reuseGrace_recoversLostResponse(t *testing.T) { + s, ormDB, rq := newRotationTestService(t) + + const lostTok = "rotated-but-response-lost" + // NOT in the active set (already revoked) ... + // ... but eligible for grace (revoked recently, grace unused). + ormDB.graceableTokens = map[string]string{sha256Hex(lostTok): "0xWALLET"} + + access, newRefresh, subj, exp, err := s.RefreshToken(context.Background(), lostTok, "anchat-test") + if err != nil { + t.Fatalf("grace recovery should succeed, got error: %v", err) + } + if access == "" || newRefresh == "" { + t.Error("grace recovery must mint a fresh access + refresh token") + } + if newRefresh == lostTok { + t.Error("grace recovery must rotate to a NEW refresh token") + } + if subj != "0xWALLET" { + t.Errorf("subject = %q, want 0xWALLET", subj) + } + if exp <= 0 { + t.Errorf("expiration not set: %d", exp) + } + // The single-use grace CAS must have been claimed exactly once. + if rq.graceCASCalls != 1 { + t.Errorf("grace CAS calls = %d, want 1", rq.graceCASCalls) + } + // And a fresh refresh-token row was inserted. + if ormDB.inserted != 1 { + t.Errorf("expected 1 INSERT for the recovered session, got %d", ormDB.inserted) + } +} + +// The grace is SINGLE-USE: once the grace_used_at CAS is lost (already +// consumed, e.g. a replay after the legitimate client already recovered), the +// token must 401 — a stolen token cannot be replayed at leisure. +func TestRefreshToken_reuseGrace_singleUse_secondAttemptIs401(t *testing.T) { + s, ormDB, rq := newRotationTestService(t) + + const tok = "already-grace-consumed" + ormDB.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"} + // Force the grace CAS to report "already consumed". + rq.graceCASNext = []int64{0} + + _, _, _, _, err := s.RefreshToken(context.Background(), tok, "anchat-test") + if err == nil { + t.Fatal("a consumed grace must NOT recover — expected an invalid-token error") + } + if !containsCI(err.Error(), "invalid or expired") { + t.Errorf("want invalid/expired 401, got %v", err) + } + if ormDB.inserted != 0 { + t.Errorf("no new session should be minted when grace is consumed; inserts=%d", ormDB.inserted) + } +} + +// A genuinely bad token (not active AND not grace-eligible) still 401s — the +// grace path must not turn unknown tokens into sessions. +func TestRefreshToken_noGrace_genuineBadToken_stays401(t *testing.T) { + s, ormDB, _ := newRotationTestService(t) + // graceableTokens left empty: nothing is grace-eligible. + + _, _, _, _, err := s.RefreshToken(context.Background(), "never-seen-this-token", "anchat-test") + if err == nil { + t.Fatal("a never-seen token must be rejected") + } + if !containsCI(err.Error(), "invalid or expired") { + t.Errorf("want invalid/expired 401, got %v", err) + } + if ormDB.inserted != 0 { + t.Errorf("no session should be minted for a bad token; inserts=%d", ormDB.inserted) + } +} + +// Security regression (bugboard #125 logout-bypass): a token explicitly revoked +// via RevokeToken (logout) must NOT be recoverable through the reuse grace, even +// within the 60s window. RevokeToken burns grace_used_at so the grace predicate +// (grace_used_at IS NULL) excludes it. +func TestRevokeToken_burnsGrace_blocksLogoutBypass(t *testing.T) { + s, ormDB, _ := newRotationTestService(t) + + const tok = "logged-out-token" + // Within the revoke window it WOULD be grace-eligible... + ormDB.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"} + + // ...until the user logs out. + if err := s.RevokeToken(context.Background(), "anchat-test", tok, false, ""); err != nil { + t.Fatalf("RevokeToken: %v", err) + } + + // A refresh with the just-logged-out token must be rejected, not resurrected. + _, _, _, _, err := s.RefreshToken(context.Background(), tok, "anchat-test") + if err == nil { + t.Fatal("LOGOUT-BYPASS: a logged-out token was resurrected via reuse grace") + } + if !containsCI(err.Error(), "invalid or expired") { + t.Errorf("want 401 invalid/expired, got %v", err) + } + if ormDB.inserted != 0 { + t.Errorf("no session should be minted for a logged-out token; inserts=%d", ormDB.inserted) + } +} diff --git a/core/pkg/gateway/auth/service.go b/core/pkg/gateway/auth/service.go index 008b162..ba00ba7 100644 --- a/core/pkg/gateway/auth/service.go +++ b/core/pkg/gateway/auth/service.go @@ -312,6 +312,13 @@ const ( // tries × 250ms rides out a brief leader re-election without adding // meaningful latency to the common (healthy-leader) path. refreshSelectRetryDelay = 250 * time.Millisecond + // refreshReuseGrace is how long after a refresh token is rotated (revoked) + // the gateway will still accept it ONE more time, to recover a client whose + // rotation response was lost in transit — otherwise the retry dead-ends in a + // 401 → SIWE, impossible on a VoIP-woken locked screen (bugboard #125, RFC + // 9700 §4.13.2). Kept short, and single-use via grace_used_at, so a stolen + // token cannot be replayed at leisure. + refreshReuseGrace = 60 * time.Second ) // RefreshToken validates the supplied refresh token, atomically rotates it @@ -408,57 +415,91 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri zap.Error(selErr)) return "", "", "", 0, ErrRefreshTransient } + // graceRecovery is set when the presented token was NOT in the active set + // but qualifies for the bugboard #125 single-use reuse grace (a just- + // rotated token whose rotation response was lost). In that case the old row + // is already revoked, so we SKIP the revoke CAS (step 2) — the grace CAS + // inside tryRefreshReuseGrace is our single-use lock — and go straight to + // minting a fresh session. + graceRecovery := false + var custom map[string]string if res.Count == 0 { - // Genuinely not found / revoked / expired — a real bad token. - return "", "", "", 0, fmt.Errorf("invalid or expired refresh token") - } - var customClaimsJSON string - if len(res.Rows) > 0 && len(res.Rows[0]) > 0 { - if val, ok := res.Rows[0][0].(string); ok { - subject = val - } else { - b, _ := json.Marshal(res.Rows[0][0]) - _ = json.Unmarshal(b, &subject) + gSubject, gCustom, gOK, gErr := s.tryRefreshReuseGrace(internalCtx, ormDB, nsID, hashedRefresh) + if gErr != nil { + // Transient rqlite error during the grace lookup/claim — retryable, + // not a verdict on the token (bugboard #125). + s.logger.ComponentWarn(logging.ComponentGeneral, + "refresh reuse-grace lookup failed (transient rqlite error, surfacing retryable)", + zap.String("namespace", namespace), zap.Error(gErr)) + return "", "", "", 0, ErrRefreshTransient } - // custom_claims (bugboard #548) — resolved once at login, replayed on - // every rotation so the refresh path never re-invokes the provider. - if len(res.Rows[0]) > 1 { - if cc, ok := res.Rows[0][1].(string); ok { - customClaimsJSON = cc + if !gOK { + // Genuinely not found / revoked outside grace / grace already + // consumed / expired — a real bad token. + return "", "", "", 0, fmt.Errorf("invalid or expired refresh token") + } + subject = gSubject + custom = gCustom + graceRecovery = true + s.logger.ComponentInfo(logging.ComponentGeneral, + "refresh token reuse-grace recovery (lost-response retry, single-use)", + zap.String("namespace", namespace), zap.String("subject", subject)) + } else { + var customClaimsJSON string + if len(res.Rows) > 0 && len(res.Rows[0]) > 0 { + if val, ok := res.Rows[0][0].(string); ok { + subject = val + } else { + b, _ := json.Marshal(res.Rows[0][0]) + _ = json.Unmarshal(b, &subject) + } + // custom_claims (bugboard #548) — resolved once at login, replayed on + // every rotation so the refresh path never re-invokes the provider. + if len(res.Rows[0]) > 1 { + if cc, ok := res.Rows[0][1].(string); ok { + customClaimsJSON = cc + } } } + custom = unmarshalClaims(customClaimsJSON) } - custom := unmarshalClaims(customClaimsJSON) // Step 2: atomic CAS — revoke the old row. RowsAffected is the lock. // Two concurrent calls with the same refresh token: exactly one wins // the UPDATE (RowsAffected == 1); the other sees RowsAffected == 0 // and bails with the replay tripwire. - updRes, err := s.db.Exec(internalCtx, - `UPDATE refresh_tokens SET revoked_at = datetime('now') - WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`, - nsID, hashedRefresh) - if err != nil { - // rqlite write error (leader unavailable) — retryable, not a bad - // token. No row was revoked, so a client retry is safe (bugboard #125). - s.logger.ComponentWarn(logging.ComponentGeneral, - "refresh token revoke failed (transient rqlite error, surfacing retryable)", - zap.String("namespace", namespace), - zap.Error(err)) - return "", "", "", 0, ErrRefreshTransient - } - affected, _ := updRes.RowsAffected() - if affected == 0 { - // Race lost OR replay attempt: token was unrevoked at step 1 but - // already revoked by step 2, meaning a concurrent call rotated it - // in between. Could be benign (same client retrying due to a - // transient network error) or malicious (stolen token + race). - // Either way: fail closed, log it, let the operator investigate. - s.logger.ComponentWarn(logging.ComponentGeneral, - "refresh token rotation: concurrent use detected (possible replay)", - zap.String("namespace", namespace), - zap.String("subject", subject)) - return "", "", "", 0, ErrRefreshTokenReplay + // + // Skipped on a grace recovery (bugboard #125): the token is ALREADY + // revoked, so this CAS would always see RowsAffected == 0 and mis-fire the + // replay tripwire. The single-use grace CAS (grace_used_at) inside + // tryRefreshReuseGrace already served as the lock for this path. + if !graceRecovery { + updRes, err := s.db.Exec(internalCtx, + `UPDATE refresh_tokens SET revoked_at = datetime('now') + WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`, + nsID, hashedRefresh) + if err != nil { + // rqlite write error (leader unavailable) — retryable, not a bad + // token. No row was revoked, so a client retry is safe (bugboard #125). + s.logger.ComponentWarn(logging.ComponentGeneral, + "refresh token revoke failed (transient rqlite error, surfacing retryable)", + zap.String("namespace", namespace), + zap.Error(err)) + return "", "", "", 0, ErrRefreshTransient + } + affected, _ := updRes.RowsAffected() + if affected == 0 { + // Race lost OR replay attempt: token was unrevoked at step 1 but + // already revoked by step 2, meaning a concurrent call rotated it + // in between. Could be benign (same client retrying due to a + // transient network error) or malicious (stolen token + race). + // Either way: fail closed, log it, let the operator investigate. + s.logger.ComponentWarn(logging.ComponentGeneral, + "refresh token rotation: concurrent use detected (possible replay)", + zap.String("namespace", namespace), + zap.String("subject", subject)) + return "", "", "", 0, ErrRefreshTokenReplay + } } // Step 3: mint the new access JWT, carrying forward the stored custom @@ -501,6 +542,74 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri return accessToken, newRefreshToken, subject, expUnix, nil } +// tryRefreshReuseGrace implements the bounded, single-use reuse grace for a +// rotated refresh token (bugboard #125, RFC 9700 §4.13.2). A token revoked +// within refreshReuseGrace whose grace_used_at is still NULL is accepted ONCE +// more — recovering a client that lost its rotation response in transit (a +// reconnect storm during a gateway roll) before it dead-ends in a 401 → SIWE. +// +// Returns (subject, custom, true, nil) on a successful single-use grace claim; +// (—, —, false, nil) when there is no eligible row, the token was revoked +// outside the grace window, it has expired, or the grace was already consumed +// (caller → 401). A non-nil error is a transient rqlite failure (caller → 503). +// +// Security: the grace is both short-windowed AND single-use (a CAS on +// grace_used_at), so a stolen token cannot be replayed repeatedly; and it never +// touches the concurrent-rotation replay tripwire, which fires on the active +// path only. +func (s *Service) tryRefreshReuseGrace(ctx context.Context, ormDB client.DatabaseClient, nsID interface{}, hashedRefresh string) (subject string, custom map[string]string, ok bool, err error) { + graceArg := fmt.Sprintf("-%d seconds", int(refreshReuseGrace.Seconds())) + sel := `SELECT subject, custom_claims FROM refresh_tokens + WHERE namespace_id = ? AND token = ? + AND revoked_at IS NOT NULL + AND revoked_at > datetime('now', ?) + AND grace_used_at IS NULL + AND (expires_at IS NULL OR expires_at > datetime('now')) + LIMIT 1` + res, qerr := ormDB.Query(ctx, sel, nsID, hashedRefresh, graceArg) + if qerr != nil { + return "", nil, false, qerr // transient rqlite error → caller 503 + } + if res == nil || res.Count == 0 { + return "", nil, false, nil // no eligible grace row → caller 401 + } + + var customClaimsJSON string + if len(res.Rows) > 0 && len(res.Rows[0]) > 0 { + if v, vok := res.Rows[0][0].(string); vok { + subject = v + } else { + b, _ := json.Marshal(res.Rows[0][0]) + _ = json.Unmarshal(b, &subject) + } + if len(res.Rows[0]) > 1 { + if cc, cok := res.Rows[0][1].(string); cok { + customClaimsJSON = cc + } + } + } + if subject == "" { + return "", nil, false, nil // defensive: never grace-mint an anonymous session + } + + // Single-use CAS: claim the grace. Exactly one caller wins; a concurrent + // replay of the same just-revoked token sees RowsAffected == 0 → no grace. + // The same time-window predicate is repeated so the claim can't succeed on a + // row that aged out of the window between the SELECT and here. + updRes, uerr := s.db.Exec(ctx, + `UPDATE refresh_tokens SET grace_used_at = datetime('now') + WHERE namespace_id = ? AND token = ? AND grace_used_at IS NULL + AND revoked_at IS NOT NULL AND revoked_at > datetime('now', ?)`, + nsID, hashedRefresh, graceArg) + if uerr != nil { + return "", nil, false, uerr // transient + } + if affected, _ := updRes.RowsAffected(); affected == 0 { + return "", nil, false, nil // grace already consumed (concurrent) → caller 401 + } + return subject, unmarshalClaims(customClaimsJSON), true, nil +} + // RevokeToken revokes a specific refresh token or all tokens for a subject func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all bool, subject string) error { internalCtx := client.WithInternalAuth(ctx) @@ -511,14 +620,20 @@ func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all return err } + // Explicit revocation (logout / revoke-all) ALSO burns the reuse-grace slot + // (grace_used_at) so a deliberately-revoked token can NEVER be recovered by + // the bugboard #125 reuse grace. Rotation does not go through RevokeToken, + // so the legitimate lost-response grace path is unaffected; this only closes + // the logout-bypass where a just-logged-out token would otherwise be + // grace-eligible for the 60s window. if token != "" { hashedToken := sha256Hex(token) - _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken) + _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken) return err } if all && subject != "" { - _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject) + _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject) return err }