mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-06-17 10:14:13 +00:00
A lost rotation response strands the client on a just-revoked token: the retry hits res.Count==0 → genuine 401 → SIWE, which is impossible on a VoIP-woken locked screen, so the call dies. This recurred under the reconnect storms from today's gateway rolls. Add an RFC 9700 §4.13.2 reuse grace: a refresh token revoked within 60s whose grace_used_at is still NULL is accepted ONCE more and mints a fresh session. The grace path skips the revoke CAS (the token is already revoked — the CAS would 0-match and mis-fire the replay tripwire) and is locked instead by a single-use CAS on grace_used_at, so a stolen token can't be replayed at leisure. The window predicate is repeated on the CAS to close the SELECT→UPDATE TOCTOU, and the grace SELECT excludes expired tokens. Security (found + fixed in review): explicit revocation (RevokeToken / /v1/auth/logout) now also stamps grace_used_at, so a deliberately-logged-out token can never be grace-recovered — closes a logout-bypass where a just-revoked token would otherwise be resurrectable for 60s. Transient rqlite errors on the grace lookup/CAS surface as 503 (retryable), not 401, preserving the #125 transient-vs-genuine distinction. Migration 032 adds grace_used_at (additive ALTER, rolling-safe; NULL = grace available, the window predicate keeps historically-revoked tokens ineligible). Dual-reviewed: code-quality APPROVED; security SECURE after the logout-bypass fix. Tests: lost-response recovery, single-use second-attempt 401, genuine bad token 401, and the logout-bypass regression.
773 lines
31 KiB
Go
773 lines
31 KiB
Go
package auth
|
||
|
||
import (
|
||
"context"
|
||
"crypto/ed25519"
|
||
"crypto/rand"
|
||
"crypto/rsa"
|
||
"crypto/sha256"
|
||
"crypto/x509"
|
||
"encoding/base64"
|
||
"encoding/hex"
|
||
"encoding/json"
|
||
"encoding/pem"
|
||
"fmt"
|
||
"math/big"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
|
||
"github.com/DeBrosOfficial/network/pkg/client"
|
||
"github.com/DeBrosOfficial/network/pkg/logging"
|
||
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||
ethcrypto "github.com/ethereum/go-ethereum/crypto"
|
||
"go.uber.org/zap"
|
||
)
|
||
|
||
// Service handles authentication business logic
|
||
type Service struct {
|
||
logger *logging.ColoredLogger
|
||
orm client.NetworkClient
|
||
db rqlite.Client // lower-level client; used where rows-affected is needed (e.g. refresh-token CAS rotation, feature #68)
|
||
signingKey *rsa.PrivateKey
|
||
keyID string
|
||
edSigningKey ed25519.PrivateKey
|
||
edKeyID string
|
||
preferEdDSA bool
|
||
defaultNS string
|
||
apiKeyHMACSecret string // HMAC secret for hashing API keys before storage
|
||
claimsResolver ClaimsResolver // namespace claims-provider hook (bugboard #548); nil = none
|
||
}
|
||
|
||
func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signingKeyPEM string, defaultNS string) (*Service, error) {
|
||
s := &Service{
|
||
logger: logger,
|
||
orm: orm,
|
||
defaultNS: defaultNS,
|
||
}
|
||
|
||
if signingKeyPEM != "" {
|
||
block, _ := pem.Decode([]byte(signingKeyPEM))
|
||
if block == nil {
|
||
return nil, fmt.Errorf("failed to parse signing key PEM")
|
||
}
|
||
key, err := x509.ParsePKCS1PrivateKey(block.Bytes)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to parse RSA private key: %w", err)
|
||
}
|
||
s.signingKey = key
|
||
|
||
// Generate a simple KID from the public key hash
|
||
pubBytes := x509.MarshalPKCS1PublicKey(&key.PublicKey)
|
||
sum := sha256.Sum256(pubBytes)
|
||
s.keyID = hex.EncodeToString(sum[:8])
|
||
}
|
||
|
||
return s, nil
|
||
}
|
||
|
||
// SetAPIKeyHMACSecret configures the HMAC secret used to hash API keys before storage.
|
||
// When set, API keys are stored as HMAC-SHA256(key, secret) in the database.
|
||
func (s *Service) SetAPIKeyHMACSecret(secret string) {
|
||
s.apiKeyHMACSecret = secret
|
||
}
|
||
|
||
// SetRqliteClient injects the lower-level rqlite client. Required for code
|
||
// paths that need rows-affected feedback for compare-and-swap operations
|
||
// (e.g. atomic refresh-token rotation, feature #68). The higher-level
|
||
// `client.NetworkClient` interface in `s.orm` does not expose RowsAffected
|
||
// on writes.
|
||
//
|
||
// Safe to call zero or one times; idempotent. Without it, methods that
|
||
// depend on CAS semantics fall back to the previous less-atomic behaviour
|
||
// (currently: RefreshToken returns ErrRotationNotConfigured).
|
||
func (s *Service) SetRqliteClient(db rqlite.Client) {
|
||
s.db = db
|
||
}
|
||
|
||
// ClaimsResolver supplies additive, namespace-defined JWT custom claims for an
// authenticated wallet when a token is minted (bugboard #548/#920).
//
// Contract for implementations: ResolveClaims has no error return and MUST be
// fail-open — answer nil rather than failing — so that a missing, slow or
// broken provider can never break authentication. The production
// implementation is expected to invoke the namespace's reserved
// `auth-claims-provider` serverless function. A nil resolver (the default for
// every namespace) means no custom claims; install one with SetClaimsResolver.
type ClaimsResolver interface {
	ResolveClaims(ctx context.Context, wallet, namespace string) map[string]string
}
|
||
|
||
// SetClaimsResolver wires the namespace claims-provider hook used at mint time.
|
||
func (s *Service) SetClaimsResolver(r ClaimsResolver) { s.claimsResolver = r }
|
||
|
||
// resolveCustomClaims returns the namespace's additive claims for this wallet,
|
||
// or nil. Fail-open by contract — the resolver never errors.
|
||
func (s *Service) resolveCustomClaims(ctx context.Context, wallet, namespace string) map[string]string {
|
||
if s.claimsResolver == nil {
|
||
return nil
|
||
}
|
||
return s.claimsResolver.ResolveClaims(ctx, wallet, namespace)
|
||
}
|
||
|
||
// ErrRotationNotConfigured is what RefreshToken returns when no rqlite client
// has been injected (see SetRqliteClient). Without rows-affected feedback the
// rotation cannot be made atomic, and refusing to rotate is safer than
// rotating non-atomically.
var ErrRotationNotConfigured = fmt.Errorf("auth service not configured for atomic refresh-token rotation (missing rqlite client)")
|
||
|
||
// HashAPIKey returns the HMAC-SHA256 hash of an API key if the HMAC secret is set,
|
||
// or returns the raw key for backward compatibility during rolling upgrade.
|
||
func (s *Service) HashAPIKey(key string) string {
|
||
if s.apiKeyHMACSecret == "" {
|
||
return key
|
||
}
|
||
return HmacSHA256Hex(key, s.apiKeyHMACSecret)
|
||
}
|
||
|
||
// SetEdDSAKey configures an Ed25519 signing key for EdDSA JWT support.
|
||
// When set, new tokens are signed with EdDSA; RS256 is still accepted for verification.
|
||
func (s *Service) SetEdDSAKey(privKey ed25519.PrivateKey) {
|
||
s.edSigningKey = privKey
|
||
pubBytes := []byte(privKey.Public().(ed25519.PublicKey))
|
||
sum := sha256.Sum256(pubBytes)
|
||
s.edKeyID = "ed_" + hex.EncodeToString(sum[:8])
|
||
s.preferEdDSA = true
|
||
}
|
||
|
||
// CreateNonce generates a new nonce and stores it in the database
|
||
func (s *Service) CreateNonce(ctx context.Context, wallet, purpose, namespace string) (string, error) {
|
||
// Generate a URL-safe random nonce (32 bytes)
|
||
buf := make([]byte, 32)
|
||
if _, err := rand.Read(buf); err != nil {
|
||
return "", fmt.Errorf("failed to generate nonce: %w", err)
|
||
}
|
||
nonce := base64.RawURLEncoding.EncodeToString(buf)
|
||
|
||
// Use internal context to bypass authentication for system operations
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
|
||
if namespace == "" {
|
||
namespace = s.defaultNS
|
||
if namespace == "" {
|
||
namespace = "default"
|
||
}
|
||
}
|
||
|
||
// Ensure namespace exists
|
||
if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", namespace); err != nil {
|
||
return "", fmt.Errorf("failed to ensure namespace: %w", err)
|
||
}
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to resolve namespace ID: %w", err)
|
||
}
|
||
|
||
// Store nonce with 5 minute expiry
|
||
walletLower := strings.ToLower(strings.TrimSpace(wallet))
|
||
if _, err := db.Query(internalCtx,
|
||
"INSERT INTO nonces(namespace_id, wallet, nonce, purpose, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+5 minutes'))",
|
||
nsID, walletLower, nonce, purpose,
|
||
); err != nil {
|
||
return "", fmt.Errorf("failed to store nonce: %w", err)
|
||
}
|
||
|
||
return nonce, nil
|
||
}
|
||
|
||
// VerifySignature verifies a wallet signature for a given nonce
|
||
func (s *Service) VerifySignature(ctx context.Context, wallet, nonce, signature, chainType string) (bool, error) {
|
||
chainType = strings.ToUpper(strings.TrimSpace(chainType))
|
||
if chainType == "" {
|
||
chainType = "ETH"
|
||
}
|
||
|
||
switch chainType {
|
||
case "ETH":
|
||
return s.verifyEthSignature(wallet, nonce, signature)
|
||
case "SOL":
|
||
return s.verifySolSignature(wallet, nonce, signature)
|
||
default:
|
||
return false, fmt.Errorf("unsupported chain type: %s", chainType)
|
||
}
|
||
}
|
||
|
||
func (s *Service) verifyEthSignature(wallet, nonce, signature string) (bool, error) {
|
||
msg := []byte(nonce)
|
||
prefix := []byte("\x19Ethereum Signed Message:\n" + strconv.Itoa(len(msg)))
|
||
hash := ethcrypto.Keccak256(prefix, msg)
|
||
|
||
sigHex := strings.TrimSpace(signature)
|
||
if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") {
|
||
sigHex = sigHex[2:]
|
||
}
|
||
sig, err := hex.DecodeString(sigHex)
|
||
if err != nil || len(sig) != 65 {
|
||
return false, fmt.Errorf("invalid signature format")
|
||
}
|
||
|
||
if sig[64] >= 27 {
|
||
sig[64] -= 27
|
||
}
|
||
|
||
pub, err := ethcrypto.SigToPub(hash, sig)
|
||
if err != nil {
|
||
return false, fmt.Errorf("signature recovery failed: %w", err)
|
||
}
|
||
|
||
addr := ethcrypto.PubkeyToAddress(*pub).Hex()
|
||
want := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(wallet, "0x"), "0X"))
|
||
got := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(addr, "0x"), "0X"))
|
||
|
||
return got == want, nil
|
||
}
|
||
|
||
func (s *Service) verifySolSignature(wallet, nonce, signature string) (bool, error) {
|
||
sig, err := base64.StdEncoding.DecodeString(signature)
|
||
if err != nil {
|
||
return false, fmt.Errorf("invalid base64 signature: %w", err)
|
||
}
|
||
if len(sig) != 64 {
|
||
return false, fmt.Errorf("invalid signature length: expected 64 bytes, got %d", len(sig))
|
||
}
|
||
|
||
pubKeyBytes, err := s.Base58Decode(wallet)
|
||
if err != nil {
|
||
return false, fmt.Errorf("invalid wallet address: %w", err)
|
||
}
|
||
if len(pubKeyBytes) != 32 {
|
||
return false, fmt.Errorf("invalid public key length: expected 32 bytes, got %d", len(pubKeyBytes))
|
||
}
|
||
|
||
message := []byte(nonce)
|
||
return ed25519.Verify(ed25519.PublicKey(pubKeyBytes), message, sig), nil
|
||
}
|
||
|
||
// IssueTokens generates access and refresh tokens for a verified wallet
|
||
func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (string, string, int64, error) {
|
||
if s.signingKey == nil {
|
||
return "", "", 0, fmt.Errorf("signing key unavailable")
|
||
}
|
||
|
||
// Resolve namespace-defined additive claims (bugboard #548) ONCE at mint
|
||
// time. Stored with the refresh token below and replayed across rotations
|
||
// so the 15-min refresh path never re-invokes the provider.
|
||
custom := s.resolveCustomClaims(ctx, wallet, namespace)
|
||
|
||
// Issue access token (15m)
|
||
token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute, custom)
|
||
if err != nil {
|
||
return "", "", 0, fmt.Errorf("failed to generate JWT: %w", err)
|
||
}
|
||
|
||
// Create refresh token (30d)
|
||
rbuf := make([]byte, 32)
|
||
if _, err := rand.Read(rbuf); err != nil {
|
||
return "", "", 0, fmt.Errorf("failed to generate refresh token: %w", err)
|
||
}
|
||
refresh := base64.RawURLEncoding.EncodeToString(rbuf)
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
return "", "", 0, fmt.Errorf("failed to resolve namespace ID: %w", err)
|
||
}
|
||
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
hashedRefresh := sha256Hex(refresh)
|
||
if _, err := db.Query(internalCtx,
|
||
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
|
||
nsID, wallet, hashedRefresh, "gateway", marshalClaims(custom),
|
||
); err != nil {
|
||
return "", "", 0, fmt.Errorf("failed to store refresh token: %w", err)
|
||
}
|
||
|
||
return token, refresh, expUnix, nil
|
||
}
|
||
|
||
// ErrRefreshTokenReplay signals that the revoke compare-and-swap in
// RefreshToken matched no row: the token was still active when it was read but
// had already been revoked by the time of the write. That means either a
// concurrent request rotated it, or a stolen token is being replayed after the
// legitimate client refreshed.
//
// Callers should treat it as a potential security event and answer 401; the
// service logs a WARN line itself so operators can audit. This is the rotation
// tripwire described by RFC 9700 §4.12.
var ErrRefreshTokenReplay = fmt.Errorf("refresh token already rotated or invalid")
|
||
|
||
// ErrRefreshTransient signals that refresh-token rotation failed for a
// RETRYABLE reason: an rqlite-layer error, not a genuinely bad or expired
// token.
//
// Background (bugboard #125): during a rolling gateway restart the rqlite
// leader is briefly unavailable (re-election window), so lookups and writes
// error out. Reporting that as "invalid token" would force a 401 and a full
// SIWE re-auth, which is impossible on a locked device answering a VoIP-woken
// call. Callers MUST therefore map this error to a retryable 503, never a 401,
// so the client retries inside the ring window instead of tearing the session
// down.
var ErrRefreshTransient = fmt.Errorf("refresh token rotation temporarily unavailable")
|
||
|
||
const (
	// refreshSelectRetries caps the number of attempts for the refresh-token
	// lookup when the rqlite read errors (transient leader unavailability).
	// The read is idempotent and precedes every write, so retrying is safe.
	refreshSelectRetries = 3

	// refreshSelectRetryDelay is the pause between lookup attempts. Three
	// attempts spaced 250ms apart ride out a brief leader re-election without
	// adding meaningful latency to the healthy-leader path.
	refreshSelectRetryDelay = 250 * time.Millisecond

	// refreshReuseGrace is the window after a refresh token has been rotated
	// (revoked) during which the gateway still accepts it ONE more time. It
	// rescues a client whose rotation response was lost in transit, which would
	// otherwise dead-end in a 401 → SIWE — impossible on a VoIP-woken locked
	// screen (bugboard #125, RFC 9700 §4.13.2). The window is short and the
	// grace is single-use (grace_used_at), so a stolen token cannot be replayed
	// at leisure.
	refreshReuseGrace = 60 * time.Second
)
|
||
|
||
// RefreshToken validates the supplied refresh token, atomically rotates it
|
||
// (revokes the old, mints a new), and returns a fresh access token alongside
|
||
// the rotated refresh token.
|
||
//
|
||
// Rotation is the RFC 9700 BCP §4.12 / feature #68 behaviour:
|
||
//
|
||
// 1. SELECT the subject for the supplied token (must be unrevoked + unexpired)
|
||
// 2. UPDATE revoked_at = now() WHERE token = ? AND revoked_at IS NULL
|
||
// -- this is the atomic CAS. If RowsAffected == 0, the race was lost
|
||
// -- (concurrent rotation or token-replay attack); we fail closed and
|
||
// -- emit a security log line so operators can investigate.
|
||
// 3. Generate a fresh refresh-token + fresh access JWT
|
||
// 4. INSERT the new refresh-token row
|
||
// 5. Return both
|
||
//
|
||
// Failure modes:
|
||
// - Token invalid/expired at step 1 → standard "invalid or expired" error,
|
||
// no security event.
|
||
// - CAS lost at step 2 → ErrRefreshTokenReplay, WARN logged with subject +
|
||
// namespace. The client sees 401.
|
||
// - Crash between step 2 and step 4 → user is left with revoked old + no
|
||
// new, forcing re-login. Acceptable: degrades to re-auth, never enables
|
||
// double-use of a single refresh token.
|
||
//
|
||
// Returns:
|
||
//
|
||
// accessToken — newly minted short-lived JWT (15 min)
|
||
// newRefreshToken — newly minted long-lived refresh token (30 days)
|
||
// subject — wallet/subject claim of the refreshed session
|
||
// expUnix — access token expiry (unix seconds)
|
||
// err — non-nil on any failure; ErrRefreshTokenReplay for CAS loss
|
||
func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace string) (accessToken, newRefreshToken, subject string, expUnix int64, err error) {
|
||
// Atomic rotation requires the lower-level rqlite client (RowsAffected
|
||
// feedback isn't exposed by the higher-level client.NetworkClient).
|
||
// Refuse to rotate non-atomically — see ErrRotationNotConfigured.
|
||
if s.db == nil {
|
||
return "", "", "", 0, ErrRotationNotConfigured
|
||
}
|
||
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
ormDB := s.orm.Database()
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
// Bugboard #125: namespace resolution runs an rqlite query BEFORE the
|
||
// token lookup, so a leader re-election during a rolling restart fails
|
||
// here too. Treat it as retryable (→ 503), not a bad token (→ 401) —
|
||
// the refresh-path namespace comes from an already-authenticated
|
||
// session, so a resolution failure is a transient DB error, never the
|
||
// client's fault.
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh namespace resolution failed (transient, surfacing retryable)",
|
||
zap.String("namespace", namespace),
|
||
zap.Error(err))
|
||
return "", "", "", 0, ErrRefreshTransient
|
||
}
|
||
|
||
hashedRefresh := sha256Hex(refreshToken)
|
||
|
||
// Step 1: read the subject. Tells us who the token belongs to AND
|
||
// validates that it's currently usable (not revoked, not expired).
|
||
//
|
||
// Bugboard #125: distinguish a TRANSIENT rqlite error (leader briefly
|
||
// unavailable during a rolling restart) from a GENUINE token miss. The
|
||
// read is idempotent and pre-write, so we retry it a few times; only after
|
||
// exhausting retries do we surface ErrRefreshTransient (→ 503, client
|
||
// retries). An actual empty result (Count == 0) is a real bad/expired
|
||
// token → "invalid or expired" (→ 401). Collapsing the two used to 401 a
|
||
// valid session during every restart, defeating the VoIP-wake refresh.
|
||
selectQ := `SELECT subject, custom_claims FROM refresh_tokens
|
||
WHERE namespace_id = ? AND token = ?
|
||
AND revoked_at IS NULL
|
||
AND (expires_at IS NULL OR expires_at > datetime('now'))
|
||
LIMIT 1`
|
||
var res *client.QueryResult
|
||
var selErr error
|
||
for attempt := 0; attempt < refreshSelectRetries; attempt++ {
|
||
res, selErr = ormDB.Query(internalCtx, selectQ, nsID, hashedRefresh)
|
||
if selErr == nil && res != nil {
|
||
break
|
||
}
|
||
if attempt < refreshSelectRetries-1 {
|
||
time.Sleep(refreshSelectRetryDelay)
|
||
}
|
||
}
|
||
if selErr != nil || res == nil {
|
||
// rqlite error persisted across retries — leader likely mid-election.
|
||
// Retryable, NOT an invalid token.
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh token lookup failed (transient rqlite error, surfacing retryable)",
|
||
zap.String("namespace", namespace),
|
||
zap.Error(selErr))
|
||
return "", "", "", 0, ErrRefreshTransient
|
||
}
|
||
// graceRecovery is set when the presented token was NOT in the active set
|
||
// but qualifies for the bugboard #125 single-use reuse grace (a just-
|
||
// rotated token whose rotation response was lost). In that case the old row
|
||
// is already revoked, so we SKIP the revoke CAS (step 2) — the grace CAS
|
||
// inside tryRefreshReuseGrace is our single-use lock — and go straight to
|
||
// minting a fresh session.
|
||
graceRecovery := false
|
||
var custom map[string]string
|
||
if res.Count == 0 {
|
||
gSubject, gCustom, gOK, gErr := s.tryRefreshReuseGrace(internalCtx, ormDB, nsID, hashedRefresh)
|
||
if gErr != nil {
|
||
// Transient rqlite error during the grace lookup/claim — retryable,
|
||
// not a verdict on the token (bugboard #125).
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh reuse-grace lookup failed (transient rqlite error, surfacing retryable)",
|
||
zap.String("namespace", namespace), zap.Error(gErr))
|
||
return "", "", "", 0, ErrRefreshTransient
|
||
}
|
||
if !gOK {
|
||
// Genuinely not found / revoked outside grace / grace already
|
||
// consumed / expired — a real bad token.
|
||
return "", "", "", 0, fmt.Errorf("invalid or expired refresh token")
|
||
}
|
||
subject = gSubject
|
||
custom = gCustom
|
||
graceRecovery = true
|
||
s.logger.ComponentInfo(logging.ComponentGeneral,
|
||
"refresh token reuse-grace recovery (lost-response retry, single-use)",
|
||
zap.String("namespace", namespace), zap.String("subject", subject))
|
||
} else {
|
||
var customClaimsJSON string
|
||
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
|
||
if val, ok := res.Rows[0][0].(string); ok {
|
||
subject = val
|
||
} else {
|
||
b, _ := json.Marshal(res.Rows[0][0])
|
||
_ = json.Unmarshal(b, &subject)
|
||
}
|
||
// custom_claims (bugboard #548) — resolved once at login, replayed on
|
||
// every rotation so the refresh path never re-invokes the provider.
|
||
if len(res.Rows[0]) > 1 {
|
||
if cc, ok := res.Rows[0][1].(string); ok {
|
||
customClaimsJSON = cc
|
||
}
|
||
}
|
||
}
|
||
custom = unmarshalClaims(customClaimsJSON)
|
||
}
|
||
|
||
// Step 2: atomic CAS — revoke the old row. RowsAffected is the lock.
|
||
// Two concurrent calls with the same refresh token: exactly one wins
|
||
// the UPDATE (RowsAffected == 1); the other sees RowsAffected == 0
|
||
// and bails with the replay tripwire.
|
||
//
|
||
// Skipped on a grace recovery (bugboard #125): the token is ALREADY
|
||
// revoked, so this CAS would always see RowsAffected == 0 and mis-fire the
|
||
// replay tripwire. The single-use grace CAS (grace_used_at) inside
|
||
// tryRefreshReuseGrace already served as the lock for this path.
|
||
if !graceRecovery {
|
||
updRes, err := s.db.Exec(internalCtx,
|
||
`UPDATE refresh_tokens SET revoked_at = datetime('now')
|
||
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL`,
|
||
nsID, hashedRefresh)
|
||
if err != nil {
|
||
// rqlite write error (leader unavailable) — retryable, not a bad
|
||
// token. No row was revoked, so a client retry is safe (bugboard #125).
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh token revoke failed (transient rqlite error, surfacing retryable)",
|
||
zap.String("namespace", namespace),
|
||
zap.Error(err))
|
||
return "", "", "", 0, ErrRefreshTransient
|
||
}
|
||
affected, _ := updRes.RowsAffected()
|
||
if affected == 0 {
|
||
// Race lost OR replay attempt: token was unrevoked at step 1 but
|
||
// already revoked by step 2, meaning a concurrent call rotated it
|
||
// in between. Could be benign (same client retrying due to a
|
||
// transient network error) or malicious (stolen token + race).
|
||
// Either way: fail closed, log it, let the operator investigate.
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh token rotation: concurrent use detected (possible replay)",
|
||
zap.String("namespace", namespace),
|
||
zap.String("subject", subject))
|
||
return "", "", "", 0, ErrRefreshTokenReplay
|
||
}
|
||
}
|
||
|
||
// Step 3: mint the new access JWT, carrying forward the stored custom
|
||
// claims so a rotated token keeps the same account_id etc. (bugboard #548).
|
||
accessToken, expUnix, err = s.GenerateJWT(namespace, subject, 15*time.Minute, custom)
|
||
if err != nil {
|
||
return "", "", "", 0, fmt.Errorf("generate access token: %w", err)
|
||
}
|
||
|
||
// Step 4: mint and persist a new refresh token (32-byte random,
|
||
// base64-url-encoded; stored hashed). 30-day TTL. Note: if this
|
||
// INSERT fails after the UPDATE succeeded (step 2), the user is left
|
||
// with revoked old + no new and must re-authenticate. Acceptable —
|
||
// degrades to re-auth, never to double-use of a single refresh token.
|
||
rbuf := make([]byte, 32)
|
||
if _, err := rand.Read(rbuf); err != nil {
|
||
return "", "", "", 0, fmt.Errorf("generate refresh token: %w", err)
|
||
}
|
||
newRefreshToken = base64.RawURLEncoding.EncodeToString(rbuf)
|
||
hashedNew := sha256Hex(newRefreshToken)
|
||
// Re-marshal from the parsed map (not the raw stored string) so the new
|
||
// row and the freshly-minted access token are provably consistent and
|
||
// self-healing — a malformed stored blob converges to "" on both sides
|
||
// rather than being propagated forward verbatim. custom_claims is written
|
||
// ONLY here and in IssueTokens, both from a sanitized map (bugboard #548).
|
||
if _, err := ormDB.Query(internalCtx,
|
||
"INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at, custom_claims) VALUES (?, ?, ?, ?, datetime('now', '+30 days'), ?)",
|
||
nsID, subject, hashedNew, "gateway", marshalClaims(custom)); err != nil {
|
||
// The old token is already revoked (step 2). A retryable error here
|
||
// leaves the client to re-attempt — which will re-auth since the old
|
||
// token is gone — but that's strictly better than masking a transient
|
||
// failure as a permanent 401 (bugboard #125). Surface retryable.
|
||
s.logger.ComponentWarn(logging.ComponentGeneral,
|
||
"refresh token store failed after revoke (transient rqlite error)",
|
||
zap.String("namespace", namespace),
|
||
zap.Error(err))
|
||
return "", "", "", 0, ErrRefreshTransient
|
||
}
|
||
|
||
return accessToken, newRefreshToken, subject, expUnix, nil
|
||
}
|
||
|
||
// tryRefreshReuseGrace implements the bounded, single-use reuse grace for a
|
||
// rotated refresh token (bugboard #125, RFC 9700 §4.13.2). A token revoked
|
||
// within refreshReuseGrace whose grace_used_at is still NULL is accepted ONCE
|
||
// more — recovering a client that lost its rotation response in transit (a
|
||
// reconnect storm during a gateway roll) before it dead-ends in a 401 → SIWE.
|
||
//
|
||
// Returns (subject, custom, true, nil) on a successful single-use grace claim;
|
||
// (—, —, false, nil) when there is no eligible row, the token was revoked
|
||
// outside the grace window, it has expired, or the grace was already consumed
|
||
// (caller → 401). A non-nil error is a transient rqlite failure (caller → 503).
|
||
//
|
||
// Security: the grace is both short-windowed AND single-use (a CAS on
|
||
// grace_used_at), so a stolen token cannot be replayed repeatedly; and it never
|
||
// touches the concurrent-rotation replay tripwire, which fires on the active
|
||
// path only.
|
||
func (s *Service) tryRefreshReuseGrace(ctx context.Context, ormDB client.DatabaseClient, nsID interface{}, hashedRefresh string) (subject string, custom map[string]string, ok bool, err error) {
|
||
graceArg := fmt.Sprintf("-%d seconds", int(refreshReuseGrace.Seconds()))
|
||
sel := `SELECT subject, custom_claims FROM refresh_tokens
|
||
WHERE namespace_id = ? AND token = ?
|
||
AND revoked_at IS NOT NULL
|
||
AND revoked_at > datetime('now', ?)
|
||
AND grace_used_at IS NULL
|
||
AND (expires_at IS NULL OR expires_at > datetime('now'))
|
||
LIMIT 1`
|
||
res, qerr := ormDB.Query(ctx, sel, nsID, hashedRefresh, graceArg)
|
||
if qerr != nil {
|
||
return "", nil, false, qerr // transient rqlite error → caller 503
|
||
}
|
||
if res == nil || res.Count == 0 {
|
||
return "", nil, false, nil // no eligible grace row → caller 401
|
||
}
|
||
|
||
var customClaimsJSON string
|
||
if len(res.Rows) > 0 && len(res.Rows[0]) > 0 {
|
||
if v, vok := res.Rows[0][0].(string); vok {
|
||
subject = v
|
||
} else {
|
||
b, _ := json.Marshal(res.Rows[0][0])
|
||
_ = json.Unmarshal(b, &subject)
|
||
}
|
||
if len(res.Rows[0]) > 1 {
|
||
if cc, cok := res.Rows[0][1].(string); cok {
|
||
customClaimsJSON = cc
|
||
}
|
||
}
|
||
}
|
||
if subject == "" {
|
||
return "", nil, false, nil // defensive: never grace-mint an anonymous session
|
||
}
|
||
|
||
// Single-use CAS: claim the grace. Exactly one caller wins; a concurrent
|
||
// replay of the same just-revoked token sees RowsAffected == 0 → no grace.
|
||
// The same time-window predicate is repeated so the claim can't succeed on a
|
||
// row that aged out of the window between the SELECT and here.
|
||
updRes, uerr := s.db.Exec(ctx,
|
||
`UPDATE refresh_tokens SET grace_used_at = datetime('now')
|
||
WHERE namespace_id = ? AND token = ? AND grace_used_at IS NULL
|
||
AND revoked_at IS NOT NULL AND revoked_at > datetime('now', ?)`,
|
||
nsID, hashedRefresh, graceArg)
|
||
if uerr != nil {
|
||
return "", nil, false, uerr // transient
|
||
}
|
||
if affected, _ := updRes.RowsAffected(); affected == 0 {
|
||
return "", nil, false, nil // grace already consumed (concurrent) → caller 401
|
||
}
|
||
return subject, unmarshalClaims(customClaimsJSON), true, nil
|
||
}
|
||
|
||
// RevokeToken revokes a specific refresh token or all tokens for a subject
|
||
func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all bool, subject string) error {
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
// Explicit revocation (logout / revoke-all) ALSO burns the reuse-grace slot
|
||
// (grace_used_at) so a deliberately-revoked token can NEVER be recovered by
|
||
// the bugboard #125 reuse grace. Rotation does not go through RevokeToken,
|
||
// so the legitimate lost-response grace path is unaffected; this only closes
|
||
// the logout-bypass where a just-logged-out token would otherwise be
|
||
// grace-eligible for the 60s window.
|
||
if token != "" {
|
||
hashedToken := sha256Hex(token)
|
||
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken)
|
||
return err
|
||
}
|
||
|
||
if all && subject != "" {
|
||
_, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now'), grace_used_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject)
|
||
return err
|
||
}
|
||
|
||
return fmt.Errorf("nothing to revoke")
|
||
}
|
||
|
||
// RegisterApp registers a new client application
|
||
func (s *Service) RegisterApp(ctx context.Context, wallet, namespace, name, publicKey string) (string, error) {
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
// Generate client app_id
|
||
buf := make([]byte, 12)
|
||
if _, err := rand.Read(buf); err != nil {
|
||
return "", fmt.Errorf("failed to generate app id: %w", err)
|
||
}
|
||
appID := "app_" + base64.RawURLEncoding.EncodeToString(buf)
|
||
|
||
// Persist app
|
||
if _, err := db.Query(internalCtx, "INSERT INTO apps(namespace_id, app_id, name, public_key) VALUES (?, ?, ?, ?)", nsID, appID, name, publicKey); err != nil {
|
||
return "", err
|
||
}
|
||
|
||
// Record ownership
|
||
_, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, ?, ?)", nsID, "wallet", wallet)
|
||
|
||
return appID, nil
|
||
}
|
||
|
||
// GetOrCreateAPIKey returns an existing API key or creates a new one for a wallet in a namespace
|
||
func (s *Service) GetOrCreateAPIKey(ctx context.Context, wallet, namespace string) (string, error) {
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
|
||
nsID, err := s.ResolveNamespaceID(ctx, namespace)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
// Try existing linkage
|
||
var apiKey string
|
||
r1, err := db.Query(internalCtx,
|
||
"SELECT api_keys.key FROM wallet_api_keys JOIN api_keys ON wallet_api_keys.api_key_id = api_keys.id WHERE wallet_api_keys.namespace_id = ? AND LOWER(wallet_api_keys.wallet) = LOWER(?) LIMIT 1",
|
||
nsID, wallet,
|
||
)
|
||
if err == nil && r1 != nil && r1.Count > 0 && len(r1.Rows) > 0 && len(r1.Rows[0]) > 0 {
|
||
if val, ok := r1.Rows[0][0].(string); ok {
|
||
apiKey = val
|
||
}
|
||
}
|
||
|
||
if apiKey != "" {
|
||
return apiKey, nil
|
||
}
|
||
|
||
// Create new API key
|
||
buf := make([]byte, 18)
|
||
if _, err := rand.Read(buf); err != nil {
|
||
return "", fmt.Errorf("failed to generate api key: %w", err)
|
||
}
|
||
apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + namespace
|
||
|
||
// Store the HMAC hash of the key (not the raw key) if HMAC secret is configured
|
||
hashedKey := s.HashAPIKey(apiKey)
|
||
if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", hashedKey, "", nsID); err != nil {
|
||
return "", fmt.Errorf("failed to store api key: %w", err)
|
||
}
|
||
|
||
// Link wallet -> api_key
|
||
rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", hashedKey)
|
||
if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 {
|
||
apiKeyID := rid.Rows[0][0]
|
||
_, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(wallet), apiKeyID)
|
||
}
|
||
|
||
// Record ownerships — store the hash in ownership too
|
||
_, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, hashedKey)
|
||
_, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, wallet)
|
||
|
||
return apiKey, nil
|
||
}
|
||
|
||
// ResolveNamespaceID ensures the given namespace exists and returns its primary key ID.
|
||
func (s *Service) ResolveNamespaceID(ctx context.Context, ns string) (interface{}, error) {
|
||
if s.orm == nil {
|
||
return nil, fmt.Errorf("client not initialized")
|
||
}
|
||
ns = strings.TrimSpace(ns)
|
||
if ns == "" {
|
||
ns = "default"
|
||
}
|
||
|
||
internalCtx := client.WithInternalAuth(ctx)
|
||
db := s.orm.Database()
|
||
|
||
if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", ns); err != nil {
|
||
return nil, err
|
||
}
|
||
res, err := db.Query(internalCtx, "SELECT id FROM namespaces WHERE name = ? LIMIT 1", ns)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if res == nil || res.Count == 0 || len(res.Rows) == 0 || len(res.Rows[0]) == 0 {
|
||
return nil, fmt.Errorf("failed to resolve namespace")
|
||
}
|
||
return res.Rows[0][0], nil
|
||
}
|
||
|
||
// Base58Decode decodes a base58-encoded string
|
||
func (s *Service) Base58Decode(input string) ([]byte, error) {
|
||
const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
|
||
answer := big.NewInt(0)
|
||
j := big.NewInt(1)
|
||
for i := len(input) - 1; i >= 0; i-- {
|
||
tmp := strings.IndexByte(alphabet, input[i])
|
||
if tmp == -1 {
|
||
return nil, fmt.Errorf("invalid base58 character")
|
||
}
|
||
idx := big.NewInt(int64(tmp))
|
||
tmp1 := new(big.Int)
|
||
tmp1.Mul(idx, j)
|
||
answer.Add(answer, tmp1)
|
||
j.Mul(j, big.NewInt(58))
|
||
}
|
||
// Handle leading zeros
|
||
res := answer.Bytes()
|
||
for i := 0; i < len(input) && input[i] == alphabet[0]; i++ {
|
||
res = append([]byte{0}, res...)
|
||
}
|
||
return res, nil
|
||
}
|