fix(auth): bounded single-use refresh-token reuse grace (#125)

A lost rotation response strands the client on a just-revoked token: the retry
hits res.Count==0 → genuine 401 → SIWE, which is impossible on a VoIP-woken
locked screen, so the call dies. This recurred under the reconnect storms from
today's gateway rolls.

Add an RFC 9700 §4.13.2 reuse grace: a refresh token revoked within 60s whose
grace_used_at is still NULL is accepted ONCE more and mints a fresh session.
The grace path skips the revoke CAS (the token is already revoked — the CAS
would 0-match and mis-fire the replay tripwire) and is locked instead by a
single-use CAS on grace_used_at, so a stolen token can't be replayed at
leisure. The window predicate is repeated on the CAS to close the
SELECT→UPDATE TOCTOU, and the grace SELECT excludes expired tokens.

Security (found + fixed in review): explicit revocation (RevokeToken /
/v1/auth/logout) now also stamps grace_used_at, so a deliberately-logged-out
token can never be grace-recovered — closes a logout-bypass where a just-
revoked token would otherwise be resurrectable for 60s. Transient rqlite
errors on the grace lookup/CAS surface as 503 (retryable), not 401, preserving
the #125 transient-vs-genuine distinction.

Migration 032 adds grace_used_at (additive ALTER, rolling-safe; NULL = grace
available, the window predicate keeps historically-revoked tokens ineligible).

Dual-reviewed: code-quality APPROVED; security SECURE after the logout-bypass
fix. Tests: lost-response recovery, single-use second-attempt 401, genuine bad
token 401, and the logout-bypass regression.
This commit is contained in:
anonpenguin23 2026-06-12 17:42:36 +03:00
parent f3145f1b90
commit 33600092a8
3 changed files with 356 additions and 43 deletions

View File

@ -0,0 +1,20 @@
-- 032_refresh_token_reuse_grace.sql
--
-- Bugboard #125: bounded, single-use reuse grace for rotated refresh tokens.
--
-- Refresh-token rotation is single-use: a successful /v1/auth/refresh revokes
-- the presented token and issues a new one. If the rotation RESPONSE is lost
-- in transit (e.g. a reconnect storm during a gateway roll), the client is
-- left holding a just-revoked token and its retry dead-ends in a 401 -> SIWE.
-- On a VoIP-woken locked screen SIWE is impossible, so the call dies.
--
-- grace_used_at lets the gateway accept a just-rotated token ONE more time
-- within a short window (RFC 9700 §4.13.2 reuse grace) and mint a fresh
-- session, while the single-use CAS on this column prevents a stolen token
-- from being replayed repeatedly. NULL = grace not yet consumed.
--
-- Additive ALTER (rolling-upgrade safe): older gateways ignore the column;
-- newer ones read it back NULL for pre-existing rows, which is the correct
-- "grace available" default.
ALTER TABLE refresh_tokens ADD COLUMN grace_used_at TIMESTAMP;

View File

@ -32,6 +32,10 @@ type rotationMockORMDB struct {
mu sync.Mutex
subjectByToken map[string]string // hashedToken -> subject (nil/missing = "invalid")
claimsByToken map[string]string // hashedToken -> custom_claims JSON (bugboard #548)
// graceableTokens: hashedToken -> subject for tokens that are revoked but
// still inside the reuse-grace window (bugboard #125). The grace SELECT
// (detected by the grace_used_at predicate) reads from here.
graceableTokens map[string]string
inserted int // count of INSERTs (new refresh-token rows)
subjects map[string]string // subject -> last hashed token inserted
// selectErrRemaining: number of upcoming "SELECT subject" calls that
@ -52,6 +56,23 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
if containsCI(sql, "SELECT id FROM namespaces") {
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{int64(1)}}}, nil
}
// Grace-path SELECT (bugboard #125): SELECT subject for a recently-revoked,
// grace-available token. Distinguished from the active-path SELECT by the
// grace_used_at predicate. Must be checked BEFORE the generic handler.
if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") && containsCI(sql, "grace_used_at") {
if len(args) < 2 {
return &client.QueryResult{Count: 0}, nil
}
hashedTok, _ := args[1].(string)
if subj, ok := m.graceableTokens[hashedTok]; ok && subj != "" {
claims := ""
if m.claimsByToken != nil {
claims = m.claimsByToken[hashedTok]
}
return &client.QueryResult{Count: 1, Rows: [][]interface{}{{subj, claims}}}, nil
}
return &client.QueryResult{Count: 0}, nil
}
// SELECT subject (+ custom_claims, bugboard #548) for the lookup.
if containsCI(sql, "SELECT subject") && containsCI(sql, "FROM refresh_tokens") {
m.selectAttemptsTaken++
@ -72,6 +93,21 @@ func (m *rotationMockORMDB) Query(_ context.Context, sql string, args ...interfa
}
return &client.QueryResult{Count: 0}, nil
}
// RevokeToken UPDATE that ALSO burns the grace slot (bugboard #125
// logout-bypass fix). Reflect it by clearing the token's grace eligibility
// so a follow-on grace SELECT misses it. (The rotation grace CAS goes
// through the rqlite Exec mock, not here, so there's no collision.)
if containsCI(sql, "UPDATE refresh_tokens") && containsCI(sql, "grace_used_at") && len(args) >= 2 {
if key, ok := args[1].(string); ok && m.graceableTokens != nil {
delete(m.graceableTokens, key) // single-token: key is the hashed token
for tok, subj := range m.graceableTokens {
if subj == key { // revoke-all: key is the subject
delete(m.graceableTokens, tok)
}
}
}
return &client.QueryResult{Count: 1}, nil
}
// INSERT new refresh_tokens row.
if containsCI(sql, "INSERT INTO refresh_tokens") {
m.inserted++
@ -113,6 +149,12 @@ type rotationMockRqlite struct {
rowsAffectedNext []int64 // programmable per-call values; pop from front. Defaults to "revoke if unrevoked".
execErrNext []error // programmable per-call errors
parallelExecGuard sync.Mutex
// graceCASNext: programmable RowsAffected for the grace CAS (UPDATE ... SET
// grace_used_at). 1 = won the single-use grace; 0 = already consumed
// (bugboard #125). Defaults to "win once per token".
graceCASNext []int64
graceConsumed map[string]bool
graceCASCalls int
}
func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interface{}) (sql.Result, error) {
@ -133,6 +175,29 @@ func (m *rotationMockRqlite) Exec(_ context.Context, sql string, args ...interfa
}
}
// Grace CAS (bugboard #125): UPDATE ... SET grace_used_at, single-use.
if containsCI(sql, "SET grace_used_at") && len(args) >= 2 {
m.graceCASCalls++
hashedTok, _ := args[1].(string)
if m.graceConsumed == nil {
m.graceConsumed = map[string]bool{}
}
var affected int64
if len(m.graceCASNext) > 0 {
affected = m.graceCASNext[0]
m.graceCASNext = m.graceCASNext[1:]
if affected == 1 {
m.graceConsumed[hashedTok] = true
}
} else if !m.graceConsumed[hashedTok] {
m.graceConsumed[hashedTok] = true
affected = 1
} else {
affected = 0
}
return &rotationFakeResult{affected: affected}, nil
}
// Default UPDATE behavior: matches if token is currently unrevoked.
if containsCI(sql, "UPDATE refresh_tokens SET revoked_at") && len(args) >= 2 {
hashedTok, _ := args[1].(string)
@ -510,3 +575,116 @@ func TestRefreshToken_propagatesCustomClaims(t *testing.T) {
t.Errorf("account_id lost across the second rotation; custom=%v", claims2.Custom)
}
}
// ----------------------------------------------------------------------------
// Bugboard #125 — bounded, single-use refresh-token reuse grace (RFC 9700
// §4.13.2). A rotation response lost in transit must NOT dead-end in a 401.
// ----------------------------------------------------------------------------
// A just-rotated token (revoked, within grace, grace not consumed) is accepted
// ONCE more and mints a fresh session — recovering a client whose rotation
// response was lost. The revoke CAS is skipped (the token is already revoked),
// so this must NOT surface the replay tripwire.
func TestRefreshToken_reuseGrace_recoversLostResponse(t *testing.T) {
s, ormDB, rq := newRotationTestService(t)
const lostTok = "rotated-but-response-lost"
// NOT in the active set (already revoked) ...
// ... but eligible for grace (revoked recently, grace unused).
ormDB.graceableTokens = map[string]string{sha256Hex(lostTok): "0xWALLET"}
access, newRefresh, subj, exp, err := s.RefreshToken(context.Background(), lostTok, "anchat-test")
if err != nil {
t.Fatalf("grace recovery should succeed, got error: %v", err)
}
if access == "" || newRefresh == "" {
t.Error("grace recovery must mint a fresh access + refresh token")
}
if newRefresh == lostTok {
t.Error("grace recovery must rotate to a NEW refresh token")
}
if subj != "0xWALLET" {
t.Errorf("subject = %q, want 0xWALLET", subj)
}
if exp <= 0 {
t.Errorf("expiration not set: %d", exp)
}
// The single-use grace CAS must have been claimed exactly once.
if rq.graceCASCalls != 1 {
t.Errorf("grace CAS calls = %d, want 1", rq.graceCASCalls)
}
// And a fresh refresh-token row was inserted.
if ormDB.inserted != 1 {
t.Errorf("expected 1 INSERT for the recovered session, got %d", ormDB.inserted)
}
}
// The grace is SINGLE-USE: once the grace_used_at CAS is lost (already
// consumed, e.g. a replay after the legitimate client already recovered), the
// token must 401 — a stolen token cannot be replayed at leisure.
func TestRefreshToken_reuseGrace_singleUse_secondAttemptIs401(t *testing.T) {
s, ormDB, rq := newRotationTestService(t)
const tok = "already-grace-consumed"
ormDB.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"}
// Force the grace CAS to report "already consumed".
rq.graceCASNext = []int64{0}
_, _, _, _, err := s.RefreshToken(context.Background(), tok, "anchat-test")
if err == nil {
t.Fatal("a consumed grace must NOT recover — expected an invalid-token error")
}
if !containsCI(err.Error(), "invalid or expired") {
t.Errorf("want invalid/expired 401, got %v", err)
}
if ormDB.inserted != 0 {
t.Errorf("no new session should be minted when grace is consumed; inserts=%d", ormDB.inserted)
}
}
// A genuinely bad token (not active AND not grace-eligible) still 401s — the
// grace path must not turn unknown tokens into sessions.
func TestRefreshToken_noGrace_genuineBadToken_stays401(t *testing.T) {
s, ormDB, _ := newRotationTestService(t)
// graceableTokens left empty: nothing is grace-eligible.
_, _, _, _, err := s.RefreshToken(context.Background(), "never-seen-this-token", "anchat-test")
if err == nil {
t.Fatal("a never-seen token must be rejected")
}
if !containsCI(err.Error(), "invalid or expired") {
t.Errorf("want invalid/expired 401, got %v", err)
}
if ormDB.inserted != 0 {
t.Errorf("no session should be minted for a bad token; inserts=%d", ormDB.inserted)
}
}
// Security regression (bugboard #125 logout-bypass): a token explicitly revoked
// via RevokeToken (logout) must NOT be recoverable through the reuse grace, even
// within the 60s window. RevokeToken burns grace_used_at so the grace predicate
// (grace_used_at IS NULL) excludes it.
func TestRevokeToken_burnsGrace_blocksLogoutBypass(t *testing.T) {
s, ormDB, _ := newRotationTestService(t)
const tok = "logged-out-token"
// Within the revoke window it WOULD be grace-eligible...
ormDB.graceableTokens = map[string]string{sha256Hex(tok): "0xWALLET"}
// ...until the user logs out.
if err := s.RevokeToken(context.Background(), "anchat-test", tok, false, ""); err != nil {
t.Fatalf("RevokeToken: %v", err)
}
// A refresh with the just-logged-out token must be rejected, not resurrected.
_, _, _, _, err := s.RefreshToken(context.Background(), tok, "anchat-test")
if err == nil {
t.Fatal("LOGOUT-BYPASS: a logged-out token was resurrected via reuse grace")
}
if !containsCI(err.Error(), "invalid or expired") {
t.Errorf("want 401 invalid/expired, got %v", err)
}
if ormDB.inserted != 0 {
t.Errorf("no session should be minted for a logged-out token; inserts=%d", ormDB.inserted)
}
}

View File

@ -312,6 +312,13 @@ const (
// tries × 250ms rides out a brief leader re-election without adding
// meaningful latency to the common (healthy-leader) path.
refreshSelectRetryDelay = 250 * time.Millisecond
// refreshReuseGrace is how long after a refresh token is rotated (revoked)
// the gateway will still accept it ONE more time, to recover a client whose
// rotation response was lost in transit — otherwise the retry dead-ends in a
// 401 → SIWE, impossible on a VoIP-woken locked screen (bugboard #125, RFC
// 9700 §4.13.2). Kept short, and single-use via grace_used_at, so a stolen
// token cannot be replayed at leisure.
refreshReuseGrace = 60 * time.Second
)
// RefreshToken validates the supplied refresh token, atomically rotates it
@ -408,57 +415,91 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
zap.Error(selErr))
return "", "", "", 0, ErrRefreshTransient
}
// graceRecovery is set when the presented token was NOT in the active set
// but qualifies for the bugboard #125 single-use reuse grace (a just-
// rotated token whose rotation response was lost). In that case the old row
// is already revoked, so we SKIP the revoke CAS (step 2) — the grace CAS
// inside tryRefreshReuseGrace is our single-use lock — and go straight to
// minting a fresh session.
graceRecovery := false
var custom map[string]string
if res.Count == 0 {
// Genuinely not found / revoked / expired — a real bad token.
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
}
var customClaimsJSON string
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if val, ok := res.Rows[0][0].(string); ok {
subject = val
} else {
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
gSubject, gCustom, gOK, gErr := s.tryRefreshReuseGrace(internalCtx, ormDB, nsID, hashedRefresh)
if gErr != nil {
// Transient rqlite error during the grace lookup/claim — retryable,
// not a verdict on the token (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh reuse-grace lookup failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace), zap.Error(gErr))
return "", "", "", 0, ErrRefreshTransient
}
// custom_claims (bugboard #548) — resolved once at login, replayed on
// every rotation so the refresh path never re-invokes the provider.
if len(res.Rows[0]) > 1 {
if cc, ok := res.Rows[0][1].(string); ok {
customClaimsJSON = cc
if !gOK {
// Genuinely not found / revoked outside grace / grace already
// consumed / expired — a real bad token.
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
}
subject = gSubject
custom = gCustom
graceRecovery = true
s.logger.ComponentInfo(logging.ComponentGeneral,
"refresh token reuse-grace recovery (lost-response retry, single-use)",
zap.String("namespace", namespace), zap.String("subject", subject))
} else {
var customClaimsJSON string
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if val, ok := res.Rows[0][0].(string); ok {
subject = val
} else {
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
}
// custom_claims (bugboard #548) — resolved once at login, replayed on
// every rotation so the refresh path never re-invokes the provider.
if len(res.Rows[0]) > 1 {
if cc, ok := res.Rows[0][1].(string); ok {
customClaimsJSON = cc
}
}
}
custom = unmarshalClaims(customClaimsJSON)
}
custom := unmarshalClaims(customClaimsJSON)
// Step 2: atomic CAS — revoke the old row. RowsAffected is the lock.
// Two concurrent calls with the same refresh token: exactly one wins
// the UPDATE (RowsAffected == 1); the other sees RowsAffected == 0
// and bails with the replay tripwire.
updRes, err := s.db.Exec(internalCtx,
`UPDATE refresh_tokens SET revoked_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
nsID, hashedRefresh)
if err != nil {
// rqlite write error (leader unavailable) — retryable, not a bad
// token. No row was revoked, so a client retry is safe (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token revoke failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
affected, _ := updRes.RowsAffected()
if affected == 0 {
// Race lost OR replay attempt: token was unrevoked at step 1 but
// already revoked by step 2, meaning a concurrent call rotated it
// in between. Could be benign (same client retrying due to a
// transient network error) or malicious (stolen token + race).
// Either way: fail closed, log it, let the operator investigate.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token rotation: concurrent use detected (possible replay)",
zap.String("namespace", namespace),
zap.String("subject", subject))
return "", "", "", 0, ErrRefreshTokenReplay
//
// Skipped on a grace recovery (bugboard #125): the token is ALREADY
// revoked, so this CAS would always see RowsAffected == 0 and mis-fire the
// replay tripwire. The single-use grace CAS (grace_used_at) inside
// tryRefreshReuseGrace already served as the lock for this path.
if !graceRecovery {
updRes, err := s.db.Exec(internalCtx,
`UPDATE refresh_tokens SET revoked_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
nsID, hashedRefresh)
if err != nil {
// rqlite write error (leader unavailable) — retryable, not a bad
// token. No row was revoked, so a client retry is safe (bugboard #125).
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token revoke failed (transient rqlite error, surfacing retryable)",
zap.String("namespace", namespace),
zap.Error(err))
return "", "", "", 0, ErrRefreshTransient
}
affected, _ := updRes.RowsAffected()
if affected == 0 {
// Race lost OR replay attempt: token was unrevoked at step 1 but
// already revoked by step 2, meaning a concurrent call rotated it
// in between. Could be benign (same client retrying due to a
// transient network error) or malicious (stolen token + race).
// Either way: fail closed, log it, let the operator investigate.
s.logger.ComponentWarn(logging.ComponentGeneral,
"refresh token rotation: concurrent use detected (possible replay)",
zap.String("namespace", namespace),
zap.String("subject", subject))
return "", "", "", 0, ErrRefreshTokenReplay
}
}
// Step 3: mint the new access JWT, carrying forward the stored custom
@ -501,6 +542,74 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri
return accessToken, newRefreshToken, subject, expUnix, nil
}
// tryRefreshReuseGrace implements the bounded, single-use reuse grace for a
// rotated refresh token (bugboard #125, RFC 9700 §4.13.2). A token revoked
// within refreshReuseGrace whose grace_used_at is still NULL is accepted ONCE
// more — recovering a client that lost its rotation response in transit (a
// reconnect storm during a gateway roll) before it dead-ends in a 401 → SIWE.
//
// Returns (subject, custom, true, nil) on a successful single-use grace claim;
// (—, —, false, nil) when there is no eligible row, the token was revoked
// outside the grace window, it has expired, or the grace was already consumed
// (caller → 401). A non-nil error is a transient rqlite failure (caller → 503).
//
// Security: the grace is both short-windowed AND single-use (a CAS on
// grace_used_at), so a stolen token cannot be replayed repeatedly; and it never
// touches the concurrent-rotation replay tripwire, which fires on the active
// path only.
func (s *Service) tryRefreshReuseGrace(ctx context.Context, ormDB client.DatabaseClient, nsID interface{}, hashedRefresh string) (subject string, custom map[string]string, ok bool, err error) {
graceArg := fmt.Sprintf("-%d seconds", int(refreshReuseGrace.Seconds()))
sel := `SELECT subject, custom_claims FROM refresh_tokens
WHERE namespace_id = ? AND token = ?
AND revoked_at IS NOT NULL
AND revoked_at > datetime('now', ?)
AND grace_used_at IS NULL
AND (expires_at IS NULL OR expires_at > datetime('now'))
LIMIT 1`
res, qerr := ormDB.Query(ctx, sel, nsID, hashedRefresh, graceArg)
if qerr != nil {
return "", nil, false, qerr // transient rqlite error → caller 503
}
if res == nil || res.Count == 0 {
return "", nil, false, nil // no eligible grace row → caller 401
}
var customClaimsJSON string
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
if v, vok := res.Rows[0][0].(string); vok {
subject = v
} else {
b, _ := json.Marshal(res.Rows[0][0])
_ = json.Unmarshal(b, &subject)
}
if len(res.Rows[0]) > 1 {
if cc, cok := res.Rows[0][1].(string); cok {
customClaimsJSON = cc
}
}
}
if subject == "" {
return "", nil, false, nil // defensive: never grace-mint an anonymous session
}
// Single-use CAS: claim the grace. Exactly one caller wins; a concurrent
// replay of the same just-revoked token sees RowsAffected == 0 → no grace.
// The same time-window predicate is repeated so the claim can't succeed on a
// row that aged out of the window between the SELECT and here.
updRes, uerr := s.db.Exec(ctx,
`UPDATE refresh_tokens SET grace_used_at = datetime('now')
WHERE namespace_id = ? AND token = ? AND grace_used_at IS NULL
AND revoked_at IS NOT NULL AND revoked_at > datetime('now', ?)`,
nsID, hashedRefresh, graceArg)
if uerr != nil {
return "", nil, false, uerr // transient
}
if affected, _ := updRes.RowsAffected(); affected == 0 {
return "", nil, false, nil // grace already consumed (concurrent) → caller 401
}
return subject, unmarshalClaims(customClaimsJSON), true, nil
}
// RevokeToken revokes a specific refresh token or all tokens for a subject
func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all bool, subject string) error {
internalCtx := client.WithInternalAuth(ctx)
@ -511,14 +620,20 @@ func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all
return err
}
// Explicit revocation (logout / revoke-all) ALSO burns the reuse-grace slot
// (grace_used_at) so a deliberately-revoked token can NEVER be recovered by
// the bugboard #125 reuse grace. Rotation does not go through RevokeToken,
// so the legitimate lost-response grace path is unaffected; this only closes
// the logout-bypass where a just-logged-out token would otherwise be
// grace-eligible for the 60s window.
if token != "" {
hashedToken := sha256Hex(token)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken)
return err
}
if all && subject != "" {
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject)
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject)
return err
}