Handle garbage-collected access tokens in the refresh token logic

We check if the access token was used when a double-refresh happened, but can't do that reliably as we started garbage-collecting expired access tokens
2026-01-09 18:09:05 +01:00
parent 991c60255e
commit 7aad841e04
2 changed files with 97 additions and 34 deletions
--- a/crates/handlers/src/oauth2/token.rs
+++ b/crates/handlers/src/oauth2/token.rs
@@ -1,3 +1,4 @@
+// Copyright 2025, 2026 Element Creations Ltd.
 // Copyright 2024, 2025 New Vector Ltd.
 // Copyright 2021-2024 The Matrix.org Foundation C.I.C.
 //
@@ -141,9 +142,6 @@ pub(crate) enum RouteError {
        refresh_token: Ulid,
    },

-    #[error("no access token associated with the refresh token {refresh_token:?}")]
-    NoAccessTokenOnRefreshToken { refresh_token: Ulid },
-
    #[error("device code grant expired")]
    DeviceCodeExpired,

@@ -171,7 +169,6 @@ impl IntoResponse for RouteError {
                | Self::ProvisionDeviceFailed(_)
                | Self::NoSuchNextRefreshToken { .. }
                | Self::NoSuchNextAccessToken { .. }
-                | Self::NoAccessTokenOnRefreshToken { .. }
        );

        TOKEN_REQUEST_COUNTER.add(1, &[KeyValue::new(RESULT, "error")]);
@@ -183,8 +180,7 @@ impl IntoResponse for RouteError {
            | Self::NoSuchOAuthSession(_)
            | Self::ProvisionDeviceFailed(_)
            | Self::NoSuchNextRefreshToken { .. }
-            | Self::NoSuchNextAccessToken { .. }
-            | Self::NoAccessTokenOnRefreshToken { .. } => (
+            | Self::NoSuchNextAccessToken { .. } => (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(ClientError::from(ClientErrorCode::ServerError)),
            ),
@@ -678,28 +674,52 @@ async fn refresh_token_grant(
            return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
        }

-        // Check if the associated access token was already used
-        let Some(access_token_id) = next_refresh_token.access_token_id else {
-            // This should in theory not happen: this means an access token got cleaned up,
-            // but the refresh token was still valid.
-            return Err(RouteError::NoAccessTokenOnRefreshToken {
-                refresh_token: next_refresh_token.id,
-            });
-        };
+        // Check if the associated access token was already used.
+        //
+        // If the access token is no longer present, we assume it was *not* used.
+        // Tokens can disappear for two main reasons:
+        //
+        //  - revoked access tokens are deleted after 1 hour
+        //  - expired access tokens are deleted after 30 days
+        //
+        // Revoked tokens are not an issue, as the associated refresh token is also
+        // revoked. For expired tokens, however, we are effectively losing the
+        // ability to prevent the client from performing a bad double-refresh.
+        // This measure is intended to enhance security when a refresh token
+        // leaks. However, the primary goal is to ensure that we do not maintain
+        // two active branches of the refresh token tree.
+        //
+        // Consider these two scenarios:
+        //
+        //   - Refresh token A is consumed, issuing refresh token B and access token C.
+        //   - The client uses access token C.
+        //   - Access token C expires after some time.
+        //   - If the client then attempts to use refresh token A again:
+        //      - If access token C is still present, the refresh will be rightfully
+        //        declined, as we have proof that it received the new set of tokens.
+        //      - If access token C was cleaned up, the refresh will succeed, issuing
+        //        new tokens but invalidating refresh token B and the original access
+        //        token C.
+        if let Some(access_token_id) = next_refresh_token.access_token_id {
+            // Load it
+            let next_access_token = repo
+                .oauth2_access_token()
+                .lookup(access_token_id)
+                .await?
+                .ok_or(RouteError::NoSuchNextAccessToken {
+                    access_token: access_token_id,
+                    refresh_token: next_refresh_token_id,
+                })?;

-        // Load it
-        let next_access_token = repo
-            .oauth2_access_token()
-            .lookup(access_token_id)
-            .await?
-            .ok_or(RouteError::NoSuchNextAccessToken {
-                access_token: access_token_id,
-                refresh_token: next_refresh_token_id,
-            })?;
+            if next_access_token.is_used() {
+                // XXX: This is a replay, we *may* want to invalidate the session
+                return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
+            }

-        if next_access_token.is_used() {
-            // XXX: This is a replay, we *may* want to invalidate the session
-            return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
+            // This could be a double-refresh, see bellow
+            repo.oauth2_access_token()
+                .revoke(clock, next_access_token)
+                .await?;
        }

        // Looks like it's a double-refresh, client lost their refresh token on
@@ -712,10 +732,6 @@ async fn refresh_token_grant(
            "Refresh token already used, but issued refresh and access tokens are unused. Assuming those were lost; revoking those and reissuing new ones."
        );

-        repo.oauth2_access_token()
-            .revoke(clock, next_access_token)
-            .await?;
-
        repo.oauth2_refresh_token()
            .revoke(clock, next_refresh_token)
            .await?;
@@ -1479,6 +1495,7 @@ mod tests {

        let fifth_response = state.request(request).await;
        fifth_response.assert_status(StatusCode::OK);
+        let fifth_response: AccessTokenResponse = fifth_response.json();

        // But now, if we re-do with the second_response.refresh_token, it should
        // fail
@@ -1491,6 +1508,48 @@ mod tests {

        let sixth_response = state.request(request).await;
        sixth_response.assert_status(StatusCode::BAD_REQUEST);
+
+        // One edge-case scenario: after 30 days, expired access tokens are
+        // deleted, so we can't track accurately if the refresh successful or
+        // not. In this case we chose to allow the refresh to succeed to avoid
+        // spuriously logging out the user.
+
+        // Make sure to mark the fifth access token as used
+        assert!(
+            state
+                .is_access_token_valid(&fifth_response.access_token)
+                .await
+        );
+
+        // Make sure to run all the cleanup tasks
+        // We run the job queue once before advancing the clock to make sure the
+        // scheduled jobs get scheduled to a time before we advanced the clock
+        state.run_jobs_in_queue().await;
+        state.clock.advance(Duration::days(31));
+        state.run_jobs_in_queue().await;
+
+        // We're not supposed to be able to use the fourth refresh token, but here we
+        // are
+        let request =
+            Request::post(mas_router::OAuth2TokenEndpoint::PATH).form(serde_json::json!({
+                "grant_type": "refresh_token",
+                "refresh_token": fourth_response.refresh_token,
+                "client_id": client.client_id,
+            }));
+
+        let seventh_response = state.request(request).await;
+        seventh_response.assert_status(StatusCode::OK);
+
+        // And the refresh token we had on the fifth response should now be invalid
+        let request =
+            Request::post(mas_router::OAuth2TokenEndpoint::PATH).form(serde_json::json!({
+                "grant_type": "refresh_token",
+                "refresh_token": fifth_response.refresh_token,
+                "client_id": client.client_id,
+            }));
+
+        let eighth_response = state.request(request).await;
+        eighth_response.assert_status(StatusCode::BAD_REQUEST);
    }

    #[sqlx::test(migrator = "mas_storage_pg::MIGRATOR")]
--- a/crates/tasks/src/database.rs
+++ b/crates/tasks/src/database.rs
@@ -72,10 +72,14 @@ impl RunnableJob for CleanupRevokedOAuthAccessTokensJob {
 impl RunnableJob for CleanupExpiredOAuthAccessTokensJob {
    #[tracing::instrument(name = "job.cleanup_expired_oauth_access_tokens", skip_all)]
    async fn run(&self, state: &State, context: JobContext) -> Result<(), JobError> {
-        // Cleanup tokens that expired more than a week ago
-        // We keep expired tokens around longer than revoked tokens, so that we
-        // can possibly give a nice 'token expired' error if that token is used
-        let until = state.clock.now() - chrono::Duration::weeks(1);
+        // Cleanup tokens that expired more than a month ago
+        // It is important to keep them around for a bit because of refresh
+        // token idempotency. When we see a refresh token twice, we allow
+        // reusing it *only* if both the next refresh token and the next access
+        // tokens were not used. By keeping expired access tokens around for a
+        // month, we cannot make the *correct* decision, we will assume that the
+        // token wasn't used. Refer to the token refresh logic for details.
+        let until = state.clock.now() - chrono::Duration::days(30);
        let mut total = 0;

        // Run until we get cancelled. We don't schedule a retry if we get cancelled, as