Handle garbage-collected access tokens in the refresh token logic
We check if the access token was used when a double-refresh happened, but can't do that reliably as we started garbage-collecting expired access tokens
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
// Copyright 2025, 2026 Element Creations Ltd.
|
||||
// Copyright 2024, 2025 New Vector Ltd.
|
||||
// Copyright 2021-2024 The Matrix.org Foundation C.I.C.
|
||||
//
|
||||
@@ -141,9 +142,6 @@ pub(crate) enum RouteError {
|
||||
refresh_token: Ulid,
|
||||
},
|
||||
|
||||
#[error("no access token associated with the refresh token {refresh_token:?}")]
|
||||
NoAccessTokenOnRefreshToken { refresh_token: Ulid },
|
||||
|
||||
#[error("device code grant expired")]
|
||||
DeviceCodeExpired,
|
||||
|
||||
@@ -171,7 +169,6 @@ impl IntoResponse for RouteError {
|
||||
| Self::ProvisionDeviceFailed(_)
|
||||
| Self::NoSuchNextRefreshToken { .. }
|
||||
| Self::NoSuchNextAccessToken { .. }
|
||||
| Self::NoAccessTokenOnRefreshToken { .. }
|
||||
);
|
||||
|
||||
TOKEN_REQUEST_COUNTER.add(1, &[KeyValue::new(RESULT, "error")]);
|
||||
@@ -183,8 +180,7 @@ impl IntoResponse for RouteError {
|
||||
| Self::NoSuchOAuthSession(_)
|
||||
| Self::ProvisionDeviceFailed(_)
|
||||
| Self::NoSuchNextRefreshToken { .. }
|
||||
| Self::NoSuchNextAccessToken { .. }
|
||||
| Self::NoAccessTokenOnRefreshToken { .. } => (
|
||||
| Self::NoSuchNextAccessToken { .. } => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ClientError::from(ClientErrorCode::ServerError)),
|
||||
),
|
||||
@@ -678,28 +674,52 @@ async fn refresh_token_grant(
|
||||
return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
|
||||
}
|
||||
|
||||
// Check if the associated access token was already used
|
||||
let Some(access_token_id) = next_refresh_token.access_token_id else {
|
||||
// This should in theory not happen: this means an access token got cleaned up,
|
||||
// but the refresh token was still valid.
|
||||
return Err(RouteError::NoAccessTokenOnRefreshToken {
|
||||
refresh_token: next_refresh_token.id,
|
||||
});
|
||||
};
|
||||
// Check if the associated access token was already used.
|
||||
//
|
||||
// If the access token is no longer present, we assume it was *not* used.
|
||||
// Tokens can disappear for two main reasons:
|
||||
//
|
||||
// - revoked access tokens are deleted after 1 hour
|
||||
// - expired access tokens are deleted after 30 days
|
||||
//
|
||||
// Revoked tokens are not an issue, as the associated refresh token is also
|
||||
// revoked. For expired tokens, however, we are effectively losing the
|
||||
// ability to prevent the client from performing a bad double-refresh.
|
||||
// This measure is intended to enhance security when a refresh token
|
||||
// leaks. However, the primary goal is to ensure that we do not maintain
|
||||
// two active branches of the refresh token tree.
|
||||
//
|
||||
// Consider these two scenarios:
|
||||
//
|
||||
// - Refresh token A is consumed, issuing refresh token B and access token C.
|
||||
// - The client uses access token C.
|
||||
// - Access token C expires after some time.
|
||||
// - If the client then attempts to use refresh token A again:
|
||||
// - If access token C is still present, the refresh will be rightfully
|
||||
// declined, as we have proof that it received the new set of tokens.
|
||||
// - If access token C was cleaned up, the refresh will succeed, issuing
|
||||
// new tokens but invalidating refresh token B and the original access
|
||||
// token C.
|
||||
if let Some(access_token_id) = next_refresh_token.access_token_id {
|
||||
// Load it
|
||||
let next_access_token = repo
|
||||
.oauth2_access_token()
|
||||
.lookup(access_token_id)
|
||||
.await?
|
||||
.ok_or(RouteError::NoSuchNextAccessToken {
|
||||
access_token: access_token_id,
|
||||
refresh_token: next_refresh_token_id,
|
||||
})?;
|
||||
|
||||
// Load it
|
||||
let next_access_token = repo
|
||||
.oauth2_access_token()
|
||||
.lookup(access_token_id)
|
||||
.await?
|
||||
.ok_or(RouteError::NoSuchNextAccessToken {
|
||||
access_token: access_token_id,
|
||||
refresh_token: next_refresh_token_id,
|
||||
})?;
|
||||
if next_access_token.is_used() {
|
||||
// XXX: This is a replay, we *may* want to invalidate the session
|
||||
return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
|
||||
}
|
||||
|
||||
if next_access_token.is_used() {
|
||||
// XXX: This is a replay, we *may* want to invalidate the session
|
||||
return Err(RouteError::RefreshTokenInvalid(next_refresh_token.id));
|
||||
// This could be a double-refresh, see bellow
|
||||
repo.oauth2_access_token()
|
||||
.revoke(clock, next_access_token)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Looks like it's a double-refresh, client lost their refresh token on
|
||||
@@ -712,10 +732,6 @@ async fn refresh_token_grant(
|
||||
"Refresh token already used, but issued refresh and access tokens are unused. Assuming those were lost; revoking those and reissuing new ones."
|
||||
);
|
||||
|
||||
repo.oauth2_access_token()
|
||||
.revoke(clock, next_access_token)
|
||||
.await?;
|
||||
|
||||
repo.oauth2_refresh_token()
|
||||
.revoke(clock, next_refresh_token)
|
||||
.await?;
|
||||
@@ -1479,6 +1495,7 @@ mod tests {
|
||||
|
||||
let fifth_response = state.request(request).await;
|
||||
fifth_response.assert_status(StatusCode::OK);
|
||||
let fifth_response: AccessTokenResponse = fifth_response.json();
|
||||
|
||||
// But now, if we re-do with the second_response.refresh_token, it should
|
||||
// fail
|
||||
@@ -1491,6 +1508,48 @@ mod tests {
|
||||
|
||||
let sixth_response = state.request(request).await;
|
||||
sixth_response.assert_status(StatusCode::BAD_REQUEST);
|
||||
|
||||
// One edge-case scenario: after 30 days, expired access tokens are
|
||||
// deleted, so we can't track accurately if the refresh successful or
|
||||
// not. In this case we chose to allow the refresh to succeed to avoid
|
||||
// spuriously logging out the user.
|
||||
|
||||
// Make sure to mark the fifth access token as used
|
||||
assert!(
|
||||
state
|
||||
.is_access_token_valid(&fifth_response.access_token)
|
||||
.await
|
||||
);
|
||||
|
||||
// Make sure to run all the cleanup tasks
|
||||
// We run the job queue once before advancing the clock to make sure the
|
||||
// scheduled jobs get scheduled to a time before we advanced the clock
|
||||
state.run_jobs_in_queue().await;
|
||||
state.clock.advance(Duration::days(31));
|
||||
state.run_jobs_in_queue().await;
|
||||
|
||||
// We're not supposed to be able to use the fourth refresh token, but here we
|
||||
// are
|
||||
let request =
|
||||
Request::post(mas_router::OAuth2TokenEndpoint::PATH).form(serde_json::json!({
|
||||
"grant_type": "refresh_token",
|
||||
"refresh_token": fourth_response.refresh_token,
|
||||
"client_id": client.client_id,
|
||||
}));
|
||||
|
||||
let seventh_response = state.request(request).await;
|
||||
seventh_response.assert_status(StatusCode::OK);
|
||||
|
||||
// And the refresh token we had on the fifth response should now be invalid
|
||||
let request =
|
||||
Request::post(mas_router::OAuth2TokenEndpoint::PATH).form(serde_json::json!({
|
||||
"grant_type": "refresh_token",
|
||||
"refresh_token": fifth_response.refresh_token,
|
||||
"client_id": client.client_id,
|
||||
}));
|
||||
|
||||
let eighth_response = state.request(request).await;
|
||||
eighth_response.assert_status(StatusCode::BAD_REQUEST);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrator = "mas_storage_pg::MIGRATOR")]
|
||||
|
||||
@@ -72,10 +72,14 @@ impl RunnableJob for CleanupRevokedOAuthAccessTokensJob {
|
||||
impl RunnableJob for CleanupExpiredOAuthAccessTokensJob {
|
||||
#[tracing::instrument(name = "job.cleanup_expired_oauth_access_tokens", skip_all)]
|
||||
async fn run(&self, state: &State, context: JobContext) -> Result<(), JobError> {
|
||||
// Cleanup tokens that expired more than a week ago
|
||||
// We keep expired tokens around longer than revoked tokens, so that we
|
||||
// can possibly give a nice 'token expired' error if that token is used
|
||||
let until = state.clock.now() - chrono::Duration::weeks(1);
|
||||
// Cleanup tokens that expired more than a month ago
|
||||
// It is important to keep them around for a bit because of refresh
|
||||
// token idempotency. When we see a refresh token twice, we allow
|
||||
// reusing it *only* if both the next refresh token and the next access
|
||||
// tokens were not used. By keeping expired access tokens around for a
|
||||
// month, we cannot make the *correct* decision, we will assume that the
|
||||
// token wasn't used. Refer to the token refresh logic for details.
|
||||
let until = state.clock.now() - chrono::Duration::days(30);
|
||||
let mut total = 0;
|
||||
|
||||
// Run until we get cancelled. We don't schedule a retry if we get cancelled, as
|
||||
|
||||
Reference in New Issue
Block a user