Automatically identify and remove dead shards

- Manager loop updated to act on dead shards and initialize new ones
- Initialization of new shards is now staggered (only a few are started per manager loop pass) in case many shards need starting
- Now using gateway intents
- Adding a delay and a second connection check when downloading guild
  members. This has greatly improved stability over the previously
  attempted method.
- No longer sends guild counts to external sites until fully connected
This commit is contained in:
Noi 2020-10-29 00:44:48 -07:00
parent a631f55a37
commit 3c73be3ee7
2 changed files with 99 additions and 66 deletions

View file

@ -27,6 +27,12 @@ namespace BirthdayBot.BackgroundServices
var exs = new List<Exception>(); var exs = new List<Exception>();
foreach (var guild in ShardInstance.DiscordClient.Guilds) foreach (var guild in ShardInstance.DiscordClient.Guilds)
{ {
if (ShardInstance.DiscordClient.ConnectionState != Discord.ConnectionState.Connected)
{
Log("Client is not connected. Stopping early.");
return;
}
// Single guilds are fully processed and are not interrupted by task cancellation. // Single guilds are fully processed and are not interrupted by task cancellation.
if (token.IsCancellationRequested) throw new TaskCanceledException(); if (token.IsCancellationRequested) throw new TaskCanceledException();
try try
@ -68,9 +74,11 @@ namespace BirthdayBot.BackgroundServices
if (diag.RoleCheck != null) return diag; if (diag.RoleCheck != null) return diag;
// Determine who's currently having a birthday // Determine who's currently having a birthday
// Note: This is where we'd call DownloadUsersAsync, but this method is capable of blocking indefinitely if (!guild.HasAllMembers)
// and making the task completely unresponsive. Must investigate further before calling it here and disabling {
// AlwaysDownloadUsers in client settings. await guild.DownloadUsersAsync().ConfigureAwait(false);
await Task.Delay(500);
}
var users = await GuildUserConfiguration.LoadAllAsync(guild.Id).ConfigureAwait(false); var users = await GuildUserConfiguration.LoadAllAsync(guild.Id).ConfigureAwait(false);
var tz = gc.TimeZone; var tz = gc.TimeZone;
var birthdays = GetGuildCurrentBirthdays(users, tz); var birthdays = GetGuildCurrentBirthdays(users, tz);

View file

@ -21,9 +21,17 @@ namespace BirthdayBot
class ShardManager : IDisposable class ShardManager : IDisposable
{ {
/// <summary> /// <summary>
/// Array indexes correspond to shard IDs. Lock on itself when modifying. /// Number of seconds between each run of the manager's watchdog task.
/// </summary> /// </summary>
private readonly ShardInstance[] _shards; private const int WatchdogInterval = 90;
/// <summary>
/// A dictionary with shard IDs as its keys and shard instances as its values.
/// When initialized, all keys will be created as configured. If an instance is removed,
/// a key's corresponding value will temporarily become null instead of the key/value
/// pair being removed.
/// </summary>
private readonly Dictionary<int, ShardInstance> _shards;
// Commonly used command handler instances // Commonly used command handler instances
private readonly Dictionary<string, CommandHandler> _dispatchCommands; private readonly Dictionary<string, CommandHandler> _dispatchCommands;
@ -35,6 +43,7 @@ namespace BirthdayBot
// Watchdog stuff // Watchdog stuff
private readonly Task _watchdogTask; private readonly Task _watchdogTask;
private readonly CancellationTokenSource _watchdogCancel; private readonly CancellationTokenSource _watchdogCancel;
private int _destroyedShards = 0;
internal Configuration Config { get; } internal Configuration Config { get; }
@ -56,10 +65,12 @@ namespace BirthdayBot
_cmdsMods = new ManagerCommands(cfg, _cmdsUser.Commands); _cmdsMods = new ManagerCommands(cfg, _cmdsUser.Commands);
foreach (var item in _cmdsMods.Commands) _dispatchCommands.Add(item.Item1, item.Item2); foreach (var item in _cmdsMods.Commands) _dispatchCommands.Add(item.Item1, item.Item2);
// Start shards _shards = new Dictionary<int, ShardInstance>();
_shards = new ShardInstance[Config.ShardCount]; // TODO implement more flexible sharding configuration here
for (int i = 0; i < _shards.Length; i++) for (int i = 0; i < Config.ShardCount; i++)
InitializeShard(i).Wait(); {
_shards.Add(i, null);
}
// Start watchdog // Start watchdog
_watchdogCancel = new CancellationTokenSource(); _watchdogCancel = new CancellationTokenSource();
@ -77,10 +88,10 @@ namespace BirthdayBot
Log("Shutting down all shards..."); Log("Shutting down all shards...");
var shardDisposes = new List<Task>(); var shardDisposes = new List<Task>();
foreach (var shard in _shards) foreach (var item in _shards)
{ {
if (shard == null) continue; if (item.Value == null) continue;
shardDisposes.Add(Task.Run(shard.Dispose)); shardDisposes.Add(Task.Run(item.Value.Dispose));
} }
if (!Task.WhenAll(shardDisposes).Wait(60000)) if (!Task.WhenAll(shardDisposes).Wait(60000))
{ {
@ -95,28 +106,25 @@ namespace BirthdayBot
/// <summary> /// <summary>
/// Creates and sets up a new shard instance. /// Creates and sets up a new shard instance.
/// </summary> /// </summary>
private async Task InitializeShard(int shardId) private async Task<ShardInstance> InitializeShard(int shardId)
{ {
ShardInstance newInstance; ShardInstance newInstance;
lock (_shards)
{
var clientConf = new DiscordSocketConfig()
{
ShardId = shardId,
TotalShards = Config.ShardCount,
LogLevel = LogSeverity.Info,
DefaultRetryMode = RetryMode.RetryRatelimit,
MessageCacheSize = 0, // not needed at all
ExclusiveBulkDelete = true, // not relevant, but this is configured to skip the warning
AlwaysDownloadUsers = true, // TODO set to false when more stable to do so
GatewayIntents = GatewayIntents.Guilds | GatewayIntents.GuildMembers | GatewayIntents.GuildMessages
};
var newClient = new DiscordSocketClient(clientConf);
newInstance = new ShardInstance(this, newClient, _dispatchCommands);
_shards[shardId] = newInstance; var clientConf = new DiscordSocketConfig()
} {
ShardId = shardId,
TotalShards = Config.ShardCount,
LogLevel = LogSeverity.Info,
DefaultRetryMode = RetryMode.RetryRatelimit,
MessageCacheSize = 0, // not needed at all
ExclusiveBulkDelete = true, // not relevant, but this is configured to skip the warning
GatewayIntents = GatewayIntents.Guilds | GatewayIntents.GuildMembers | GatewayIntents.GuildMessages
};
var newClient = new DiscordSocketClient(clientConf);
newInstance = new ShardInstance(this, newClient, _dispatchCommands);
await newInstance.StartAsync().ConfigureAwait(false); await newInstance.StartAsync().ConfigureAwait(false);
return newInstance;
} }
private async Task WatchdogLoop() private async Task WatchdogLoop()
@ -127,59 +135,59 @@ namespace BirthdayBot
{ {
Log($"Bot uptime: {Common.BotUptime}"); Log($"Bot uptime: {Common.BotUptime}");
// Gather statistical information within the lock // Iterate through shard list, extract data
var guildInfo = new (int, int, TimeSpan)[_shards.Length]; // guild count, conn score, last run var guildInfo = new Dictionary<int, (int, int, TimeSpan)>();
var now = DateTimeOffset.UtcNow; var now = DateTimeOffset.UtcNow;
ulong? botId = null; ulong? botId = null;
lock (_shards) var nullShards = new List<int>();
foreach (var item in _shards)
{ {
for (int i = 0; i < _shards.Length; i++) if (item.Value == null)
{ {
var shard = _shards[i]; nullShards.Add(item.Key);
botId ??= shard.DiscordClient.CurrentUser?.Id; continue;
var guildCount = shard.DiscordClient.Guilds.Count;
var connScore = shard.ConnectionScore;
var lastRun = now - shard.LastBackgroundRun;
guildInfo[i] = (guildCount, connScore, lastRun);
} }
var shard = item.Value;
botId ??= shard.DiscordClient.CurrentUser?.Id;
var guildCount = shard.DiscordClient.Guilds.Count;
var connScore = shard.ConnectionScore;
var lastRun = now - shard.LastBackgroundRun;
guildInfo[item.Key] = (guildCount, connScore, lastRun);
} }
// Process info // Process info
var guildCounts = guildInfo.Select(i => i.Item1); var guildCounts = guildInfo.Select(i => i.Value.Item1);
var guildTotal = guildCounts.Sum(); var guildTotal = guildCounts.Sum();
var guildAverage = guildCounts.Average(); var guildAverage = guildCounts.Any() ? guildCounts.Average() : 0;
Log($"Currently in {guildTotal} guilds. Average shard load: {guildAverage:0.0}."); Log($"Currently in {guildTotal} guilds. Average shard load: {guildAverage:0.0}.");
if (botId.HasValue) if (nullShards.Count == 0 && botId.HasValue)
await SendExternalStatistics(guildTotal, botId.Value, _watchdogCancel.Token).ConfigureAwait(false); await SendExternalStatistics(guildTotal, botId.Value, _watchdogCancel.Token).ConfigureAwait(false);
// Health report // Health report
var goodShards = new List<int>(); var goodShards = new List<int>();
var badShards = new List<int>(); // shards with a low connection score / long time since last work var badShards = new List<int>(); // shards with low connection score OR long time since last work
var deadShards = new List<int>(); // shards to destroy and reinitialize var deadShards = new List<int>(); // shards to destroy and reinitialize
for (int i = 0; i < guildInfo.Length; i++) foreach (var item in guildInfo)
{ {
var connScore = guildInfo[i].Item2; var connScore = item.Value.Item2;
var lastRun = guildInfo[i].Item3; var lastRun = item.Value.Item3;
if (lastRun > new TimeSpan(0, 20, 0) || connScore < ConnectionStatus.StableScore) if (lastRun > new TimeSpan(0, 10, 0) || connScore < ConnectionStatus.StableScore)
{ {
badShards.Add(i); badShards.Add(item.Key);
// This is for now the only deciding factor on whether to discard a shard, // Consider a shard dead after a long span without background activity
// without regards to score. if (lastRun > new TimeSpan(0, 30, 0))
if (lastRun > new TimeSpan(1, 0, 0)) deadShards.Add(item.Key);
{
deadShards.Add(i);
}
} }
else else
{ {
goodShards.Add(i); goodShards.Add(item.Key);
} }
} }
string statusDisplay(IEnumerable<int> list, bool detailedInfo)
string catNumbers(IEnumerable<int> list, bool detailedInfo)
{ {
if (!list.Any()) return "--"; if (!list.Any()) return "--";
var result = new StringBuilder(); var result = new StringBuilder();
@ -193,20 +201,37 @@ namespace BirthdayBot
result.Append($" {Math.Floor(guildInfo[item].Item3.TotalMinutes):00}m"); result.Append($" {Math.Floor(guildInfo[item].Item3.TotalMinutes):00}m");
result.Append($"{guildInfo[item].Item3.Seconds:00}s] "); result.Append($"{guildInfo[item].Item3.Seconds:00}s] ");
} }
} }
if (result.Length > 0) result.Remove(result.Length - 1, 1); if (result.Length > 0) result.Remove(result.Length - 1, 1);
return result.ToString(); return result.ToString();
} }
Log("Stable shards: " + catNumbers(goodShards, false)); Log("Stable shards: " + statusDisplay(goodShards, false));
if (badShards.Count > 0) Log("Unstable shards: " + catNumbers(badShards, true)); if (badShards.Count > 0) Log("Unstable shards: " + statusDisplay(badShards, true));
if (deadShards.Count > 0) Log("Shards to be restarted: " + catNumbers(deadShards, false)); if (deadShards.Count > 0) Log("Shards to be restarted: " + statusDisplay(deadShards, false));
{ if (nullShards.Count > 0) Log("Inactive shards: " + statusDisplay(nullShards, false));
// Remove dead shards
foreach (var dead in deadShards)
{
_shards[dead].Dispose();
_shards[dead] = null;
} }
// 120 second delay // Start up any missing shards
await Task.Delay(120 * 1000, _watchdogCancel.Token).ConfigureAwait(false); int startAllowance = 4;
foreach (var id in nullShards)
{
// To avoid possible issues with resources strained over so many shards starting at once,
// initialization is spread out by only starting a few at a time.
if (startAllowance-- > 0)
{
_shards[id] = await InitializeShard(id).ConfigureAwait(false);
}
else break;
}
// All done for now
await Task.Delay(WatchdogInterval * 1000, _watchdogCancel.Token).ConfigureAwait(false);
} }
} }
catch (TaskCanceledException) { } catch (TaskCanceledException) { }