Reduce shard health report complexity

Much of this extra reporting and behavior was designed at a time when
there were many, many issues with running the bot on a large
number of servers. The reason for these bugs turned out to be simple:
the hardware was not fast enough to keep up with everything.

This extra reporting and these extra behaviors have not been
necessary for some time, and it is best to remove them.
This commit is contained in:
Noi 2022-09-03 18:04:55 -07:00
parent 854cd0626b
commit f4d5bfecd0
4 changed files with 45 additions and 100 deletions

View file

@ -14,7 +14,6 @@ class Configuration {
public string BotToken { get; } public string BotToken { get; }
public string? DBotsToken { get; } public string? DBotsToken { get; }
public bool QuitOnFails { get; }
public int ShardStart { get; } public int ShardStart { get; }
public int ShardAmount { get; } public int ShardAmount { get; }
@ -46,7 +45,6 @@ class Configuration {
BotToken = ReadConfKey<string>(jc, nameof(BotToken), true); BotToken = ReadConfKey<string>(jc, nameof(BotToken), true);
DBotsToken = ReadConfKey<string>(jc, nameof(DBotsToken), false); DBotsToken = ReadConfKey<string>(jc, nameof(DBotsToken), false);
QuitOnFails = ReadConfKey<bool?>(jc, nameof(QuitOnFails), false) ?? false;
ShardTotal = args.ShardTotal ?? ReadConfKey<int?>(jc, nameof(ShardTotal), false) ?? 1; ShardTotal = args.ShardTotal ?? ReadConfKey<int?>(jc, nameof(ShardTotal), false) ?? 1;
if (ShardTotal < 1) throw new Exception($"'{nameof(ShardTotal)}' must be a positive integer."); if (ShardTotal < 1) throw new Exception($"'{nameof(ShardTotal)}' must be a positive integer.");

View file

@ -43,13 +43,14 @@ class Program {
public static void ProgramStop() { public static void ProgramStop() {
if (_stopping) return; if (_stopping) return;
_stopping = true; _stopping = true;
Log("Shutdown", "Commencing shutdown..."); Log(nameof(Program), "Shutting down...");
var dispose = Task.Run(_bot!.Dispose); var dispose = Task.Run(_bot!.Dispose);
if (!dispose.Wait(90000)) { if (!dispose.Wait(30000)) {
Log("Shutdown", "Normal shutdown has not concluded after 90 seconds. Will force quit."); Log(nameof(Program), "Disconnection is taking too long. Will force exit.");
Environment.ExitCode &= (int)ExitCodes.ForcedExit; Environment.ExitCode &= (int)ExitCodes.ForcedExit;
} }
Log(nameof(Program), $"Uptime: {BotUptime}");
Environment.Exit(Environment.ExitCode); Environment.Exit(Environment.ExitCode);
} }

View file

@ -46,7 +46,6 @@ public sealed class ShardInstance : IDisposable {
// Background task constructor begins background processing immediately. // Background task constructor begins background processing immediately.
_background = new ShardBackgroundWorker(this); _background = new ShardBackgroundWorker(this);
Log(nameof(ShardInstance), "Instance created.");
} }
/// <summary> /// <summary>
@ -66,7 +65,6 @@ public sealed class ShardInstance : IDisposable {
DiscordClient.LogoutAsync().Wait(5000); DiscordClient.LogoutAsync().Wait(5000);
DiscordClient.Dispose(); DiscordClient.Dispose();
_interactionService.Dispose(); _interactionService.Dispose();
Log(nameof(ShardInstance), "Instance disposed.");
} }
internal void Log(string source, string message) => Program.Log($"Shard {ShardId:00}] [{source}", message); internal void Log(string source, string message) => Program.Log($"Shard {ShardId:00}] [{source}", message);

View file

@ -14,12 +14,7 @@ class ShardManager : IDisposable {
/// <summary> /// <summary>
/// Number of seconds between each run of the status task. /// Number of seconds between each run of the status task.
/// </summary> /// </summary>
private const int StatusInterval = 60; private const int StatusInterval = 90;
/// <summary>
/// Number of shards allowed to be destroyed before the program may close itself, if configured.
/// </summary>
private const int MaxDestroyedShards = 10; // TODO make configurable
/// <summary> /// <summary>
/// Number of concurrent shard startups to happen on each check. /// Number of concurrent shard startups to happen on each check.
@ -29,8 +24,7 @@ class ShardManager : IDisposable {
/// <summary> /// <summary>
/// Amount of time without a completed background service run before a shard instance /// Amount of time without a completed background service run before a shard instance
/// is considered "dead" and tasked to be removed. A fraction of this value is also used /// is considered "dead" and tasked to be removed.
/// to determine when a shard is "slow".
/// </summary> /// </summary>
private static readonly TimeSpan DeadShardThreshold = new(0, 20, 0); private static readonly TimeSpan DeadShardThreshold = new(0, 20, 0);
@ -44,7 +38,6 @@ class ShardManager : IDisposable {
private readonly Task _statusTask; private readonly Task _statusTask;
private readonly CancellationTokenSource _mainCancel; private readonly CancellationTokenSource _mainCancel;
private int _destroyedShards = 0;
internal Configuration Config { get; } internal Configuration Config { get; }
@ -63,7 +56,7 @@ class ShardManager : IDisposable {
// Start status reporting thread // Start status reporting thread
_mainCancel = new CancellationTokenSource(); _mainCancel = new CancellationTokenSource();
_statusTask = Task.Factory.StartNew(StatusLoop, _mainCancel.Token, _statusTask = Task.Factory.StartNew(StatusLoop, _mainCancel.Token,
TaskCreationOptions.LongRunning, TaskScheduler.Default); TaskCreationOptions.LongRunning, TaskScheduler.Default);
} }
public void Dispose() { public void Dispose() {
@ -91,8 +84,6 @@ class ShardManager : IDisposable {
/// Creates and sets up a new shard instance. /// Creates and sets up a new shard instance.
/// </summary> /// </summary>
private async Task<ShardInstance> InitializeShard(int shardId) { private async Task<ShardInstance> InitializeShard(int shardId) {
ShardInstance newInstance;
var clientConf = new DiscordSocketConfig() { var clientConf = new DiscordSocketConfig() {
ShardId = shardId, ShardId = shardId,
TotalShards = Config.ShardTotal, TotalShards = Config.ShardTotal,
@ -107,8 +98,8 @@ class ShardManager : IDisposable {
.AddSingleton(s => new DiscordSocketClient(clientConf)) .AddSingleton(s => new DiscordSocketClient(clientConf))
.AddSingleton(s => new InteractionService(s.GetRequiredService<DiscordSocketClient>())) .AddSingleton(s => new InteractionService(s.GetRequiredService<DiscordSocketClient>()))
.BuildServiceProvider(); .BuildServiceProvider();
newInstance = services.GetRequiredService<ShardInstance>(); var newInstance = services.GetRequiredService<ShardInstance>();
await newInstance.StartAsync().ConfigureAwait(false); await newInstance.StartAsync();
return newInstance; return newInstance;
} }
@ -121,105 +112,62 @@ class ShardManager : IDisposable {
return null; return null;
} }
#region Status checking and display
private struct GuildStatusData {
public int GuildCount;
public TimeSpan LastTaskRunTime;
public string? ExecutingTask;
}
private static string StatusDisplay(IEnumerable<int> guildList, Dictionary<int, GuildStatusData> guildInfo, bool showDetail) {
if (!guildList.Any()) return "--";
var result = new StringBuilder();
foreach (var item in guildList) {
result.Append(item.ToString("00") + " ");
if (showDetail) {
result.Remove(result.Length - 1, 1);
result.Append($"[{Math.Floor(guildInfo[item].LastTaskRunTime.TotalSeconds):000}s");
if (guildInfo[item].ExecutingTask != null)
result.Append($" {guildInfo[item].ExecutingTask}");
result.Append("] ");
}
}
if (result.Length > 0) result.Remove(result.Length - 1, 1);
return result.ToString();
}
private async Task StatusLoop() { private async Task StatusLoop() {
try { try {
while (!_mainCancel.IsCancellationRequested) { while (!_mainCancel.IsCancellationRequested) {
Log($"Bot uptime: {Program.BotUptime}"); Log($"Uptime: {Program.BotUptime}");
// Iterate through shard list, extract data // Iterate through shards, create report on each
var guildInfo = new Dictionary<int, GuildStatusData>(); var shardStatuses = new StringBuilder();
var now = DateTimeOffset.UtcNow;
var nullShards = new List<int>(); var nullShards = new List<int>();
foreach (var item in _shards) { var deadShards = new List<int>();
if (item.Value == null) { for (var i = 0; i < _shards.Count; i++) {
nullShards.Add(item.Key); shardStatuses.Append($"Shard {i:00}: ");
if (_shards[i] == null) {
shardStatuses.AppendLine("Inactive.");
nullShards.Add(i);
continue; continue;
} }
var shard = item.Value;
guildInfo[item.Key] = new GuildStatusData {
GuildCount = shard.DiscordClient.Guilds.Count,
LastTaskRunTime = now - shard.LastBackgroundRun,
ExecutingTask = shard.CurrentExecutingService
};
}
// Process info
var guildCounts = guildInfo.Select(i => i.Value.GuildCount);
var guildTotal = guildCounts.Sum();
var guildAverage = guildCounts.Any() ? guildCounts.Average() : 0;
Log($"Currently in {guildTotal} guilds. Average shard load: {guildAverage:0.0}.");
// Health report
var goodShards = new List<int>();
var badShards = new List<int>(); // shards with low connection score OR long time since last work
var deadShards = new List<int>(); // shards to destroy and reinitialize
foreach (var item in guildInfo) {
var lastRun = item.Value.LastTaskRunTime;
var shard = _shards[i]!;
var client = shard.DiscordClient;
shardStatuses.Append($"{Enum.GetName(typeof(ConnectionState), client.ConnectionState)} ({client.Latency:000}ms).");
shardStatuses.Append($" Guilds: {client.Guilds.Count}.");
shardStatuses.Append($" Background: {shard.CurrentExecutingService ?? "Idle"}");
var lastRun = DateTimeOffset.UtcNow - shard.LastBackgroundRun;
if (lastRun > DeadShardThreshold / 3) { if (lastRun > DeadShardThreshold / 3) {
badShards.Add(item.Key); // Formerly known as a 'slow' shard
shardStatuses.Append($", heartbeat {Math.Floor(lastRun.TotalMinutes):00}m ago.");
// Consider a shard dead after a long span without background activity
if (lastRun > DeadShardThreshold)
deadShards.Add(item.Key);
} else { } else {
goodShards.Add(item.Key); shardStatuses.Append('.');
}
shardStatuses.AppendLine();
if (lastRun > DeadShardThreshold) {
shardStatuses.AppendLine($"Shard {i:00} marked for disposal.");
deadShards.Add(i);
} }
} }
Log("Online: " + StatusDisplay(goodShards, guildInfo, false)); Log(shardStatuses.ToString().TrimEnd());
if (badShards.Count > 0) Log("Slow: " + StatusDisplay(badShards, guildInfo, true));
if (deadShards.Count > 0) Log("Dead: " + StatusDisplay(deadShards, guildInfo, false));
if (nullShards.Count > 0) Log("Offline: " + StatusDisplay(nullShards, guildInfo, false));
// Remove dead shards // Remove dead shards
foreach (var dead in deadShards) { foreach (var dead in deadShards) {
_shards[dead]!.Dispose(); _shards[dead]!.Dispose();
_shards[dead] = null; _shards[dead] = null;
_destroyedShards++;
}
if (Config.QuitOnFails && _destroyedShards > MaxDestroyedShards) {
Environment.ExitCode = (int)Program.ExitCodes.DeadShardThreshold;
Program.ProgramStop();
} else {
// Start up any missing shards
var startAllowance = MaxConcurrentOperations;
foreach (var id in nullShards) {
// To avoid possible issues with resources strained over so many shards starting at once,
// initialization is spread out by only starting a few at a time.
if (startAllowance-- > 0) {
_shards[id] = await InitializeShard(id).ConfigureAwait(false);
} else break;
}
} }
await Task.Delay(StatusInterval * 1000, _mainCancel.Token).ConfigureAwait(false); // Start null shards, a few at a time
var startAllowance = MaxConcurrentOperations;
foreach (var id in nullShards) {
if (startAllowance-- > 0) {
_shards[id] = await InitializeShard(id);
} else break;
}
await Task.Delay(StatusInterval * 1000, _mainCancel.Token);
} }
} catch (TaskCanceledException) { } } catch (TaskCanceledException) { }
} }
#endregion
} }