Use staggered exponential backoff when retrying in PersistentEndpointClientConnector

There are suspicions that the straight 5s retry could have caused a
situation a few days ago for `osu-server-spectator` wherein it was
getting hammered by constant retry requests. This should make that
a little less likely to happen.

Numbers chosen are arbitrary, but mostly follow stable's bancho retry
intervals because why not. Stable also skips the exponential backoff in
case of errors it considers transient, but I decided not to bother for
now.

Starts off from 3 seconds, then ramps up to 2 minutes. Added
stagger factor is 25% of duration, either direction. The stagger factor
helps given that if spectator server is dead, each client has three
separate connections to it which it will retry on (one to each hub).
This commit is contained in:
Bartłomiej Dach 2024-02-06 14:48:49 +01:00
parent d784934bce
commit 6ffe8e1713
No known key found for this signature in database

View File

@ -7,6 +7,7 @@ using System.Threading.Tasks;
using osu.Framework.Bindables;
using osu.Framework.Extensions.TypeExtensions;
using osu.Framework.Logging;
using osu.Framework.Utils;
using osu.Game.Online.API;
namespace osu.Game.Online
@ -31,6 +32,12 @@ namespace osu.Game.Online
private CancellationTokenSource connectCancelSource = new CancellationTokenSource();
private bool started;
/// <summary>
/// How much to delay before attempting to connect again, in milliseconds.
/// Subject to exponential back-off.
/// </summary>
private int retryDelay = 3000;
/// <summary>
/// Constructs a new <see cref="PersistentEndpointClientConnector"/>.
/// </summary>
@ -78,6 +85,8 @@ namespace osu.Game.Online
private async Task connect()
{
cancelExistingConnect();
// reset retry delay to default.
retryDelay = 3000;
if (!await connectionLock.WaitAsync(10000).ConfigureAwait(false))
throw new TimeoutException("Could not obtain a lock to connect. A previous attempt is likely stuck.");
@ -134,8 +143,15 @@ namespace osu.Game.Online
/// </summary>
private async Task handleErrorAndDelay(Exception exception, CancellationToken cancellationToken)
{
// NOTE(review): this view looks like a diff hunk with its +/- markers stripped. The next two statements
// (plain log + fixed 5000ms delay) are presumably the removed pre-change lines, and everything after them
// the replacement - confirm against the actual file before treating both as live code.
Logger.Log($"{ClientName} connect attempt failed: {exception.Message}", LoggingTarget.Network);
await Task.Delay(5000, cancellationToken).ConfigureAwait(false);
// random stagger factor to avoid mass incidental synchronisation
// compare: https://github.com/peppy/osu-stable-reference/blob/013c3010a9d495e3471a9c59518de17006f9ad89/osu!/Online/BanchoClient.cs#L331
// the wait for this attempt is the current retryDelay scaled by a random factor drawn from RNG.NextDouble(0.75, 1.25),
// truncated to whole milliseconds by the int cast.
int thisDelay = (int)(retryDelay * RNG.NextDouble(0.75, 1.25));
// exponential backoff with upper limit
// compare: https://github.com/peppy/osu-stable-reference/blob/013c3010a9d495e3471a9c59518de17006f9ad89/osu!/Online/BanchoClient.cs#L539
// grows the stored delay by 1.5x for the *next* failure, capped at 120000ms.
// this happens after thisDelay was computed, so the current wait is based on the pre-growth value.
retryDelay = Math.Min(120000, (int)(retryDelay * 1.5));
// note: thisDelay / 1000 is integer division, so the number of seconds shown in the log is truncated, not rounded.
Logger.Log($"{ClientName} connect attempt failed: {exception.Message}. Next attempt in {thisDelay / 1000:N0} seconds.", LoggingTarget.Network);
// the wait observes cancellationToken, so it can end before the full delay has elapsed.
await Task.Delay(thisDelay, cancellationToken).ConfigureAwait(false);
}
/// <summary>