From f7e8c1cf18684839e2076221a24deb69749a8a39 Mon Sep 17 00:00:00 2001 From: Alexei Fedotov Date: Thu, 28 May 2026 12:18:38 +0200 Subject: [PATCH] Add external drain sentinel for graceful Listener shutdown Introduces a sentinel-file check (`<runner-root>/.drain`) at the top of the message-queue loop, decoupled from `RunnerShutdownToken`. When the file is present, the Listener exits after the current iteration completes -- in contrast with SIGTERM/SIGINT, which cancel the in-flight `GetAgentMessageAsync` HTTP call and any pending `/acknowledge` or `/acquirejob` calls via the linked `messageQueueLoopTokenSource`. This addresses a race that affects external supervisors (autoscalers, custom AMIs, ephemeral-runner orchestrators) that need to terminate an idle runner: between deciding the runner is idle and killing it, the broker can dispatch a job onto the open long-poll. SIGTERM mid-dispatch leaves the server with a committed but unservable job. The sentinel lets the supervisor say "stop after the current iteration completes" without forcing mid-HTTP cancellation, eliminating the ack-sent-no-Worker window. No behavior change for runners whose supervisor does not create the file. --- src/Runner.Listener/Runner.cs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/Runner.Listener/Runner.cs b/src/Runner.Listener/Runner.cs index e20eb6256e5..aa112818028 100644 --- a/src/Runner.Listener/Runner.cs +++ b/src/Runner.Listener/Runner.cs @@ -511,8 +511,28 @@ private async Task RunAsync(RunnerSettings settings, bool runOnce = false, jobDispatcher.JobStatus += _listener.OnJobStatus; + // External drain sentinel: an out-of-process supervisor (e.g. an + // autoscaler preparing to terminate an idle ephemeral runner) creates + // this file to request a graceful drain. The check is intentionally + // NOT linked into RunnerShutdownToken, so a drain signal arriving + // mid-iteration cannot abort an in-flight long-poll, /acknowledge, or + // /acquirejob HTTP call. 
The current iteration runs to completion; + // only the next iteration is skipped. After the loop exits, the + // existing finally block calls DeleteSessionAsync, telling the broker + // to stop dispatching to this session. + string drainSentinelPath = Path.Combine(HostContext.GetDirectory(WellKnownDirectory.Root), ".drain"); + // Clean up any stale sentinel left by a prior process so we do not + // exit immediately on the first iteration. + File.Delete(drainSentinelPath); + while (!HostContext.RunnerShutdownToken.IsCancellationRequested) { + if (File.Exists(drainSentinelPath)) + { + Trace.Info($"Drain sentinel detected at {drainSentinelPath}; exiting message loop after current iteration"); + break; + } + // Check if we need to restart the session and can do so (job dispatcher not busy) if (restartSessionPending && !jobDispatcher.Busy) {