From 1beaebcff88ac72d3fc46f4f9ac6b3fa8617dd5e Mon Sep 17 00:00:00 2001 From: kvega005 Date: Wed, 10 Jun 2026 15:44:51 -0700 Subject: [PATCH 01/22] Use event instead of termination callback --- msipackage/package.wix.in | 8 --- src/windows/WslcSDK/CMakeLists.txt | 2 - src/windows/WslcSDK/TerminationCallback.cpp | 58 ------------------- src/windows/WslcSDK/TerminationCallback.h | 32 ---------- src/windows/WslcSDK/WslcsdkPrivate.h | 3 - src/windows/WslcSDK/wslcsdk.cpp | 43 +++++++++----- src/windows/WslcSDK/wslcsdk.def | 3 +- src/windows/WslcSDK/wslcsdk.h | 9 +-- src/windows/service/exe/HcsVirtualMachine.cpp | 29 ++++++---- src/windows/service/exe/HcsVirtualMachine.h | 5 +- src/windows/service/inc/wslc.idl | 26 +++++---- src/windows/wslcsession/WSLCSession.cpp | 49 +++++++++++++++- src/windows/wslcsession/WSLCSession.h | 7 ++- src/windows/wslcsession/WSLCVirtualMachine.h | 6 ++ test/windows/WSLCTests.cpp | 46 +++++---------- test/windows/WslcSdkTests.cpp | 55 ++++++++---------- 16 files changed, 169 insertions(+), 212 deletions(-) delete mode 100644 src/windows/WslcSDK/TerminationCallback.cpp delete mode 100644 src/windows/WslcSDK/TerminationCallback.h diff --git a/msipackage/package.wix.in b/msipackage/package.wix.in index 2584248b68..f37c0d0dcb 100644 --- a/msipackage/package.wix.in +++ b/msipackage/package.wix.in @@ -306,14 +306,6 @@ - - - - - - - - diff --git a/src/windows/WslcSDK/CMakeLists.txt b/src/windows/WslcSDK/CMakeLists.txt index d9f0b7e778..f1224d9891 100644 --- a/src/windows/WslcSDK/CMakeLists.txt +++ b/src/windows/WslcSDK/CMakeLists.txt @@ -1,7 +1,6 @@ set(SOURCES IOCallback.cpp ProgressCallback.cpp - TerminationCallback.cpp wslcsdk.cpp WslcsdkPrivate.cpp ) @@ -9,7 +8,6 @@ set(HEADERS Defaults.h IOCallback.h ProgressCallback.h - TerminationCallback.h wslcsdk.h WslcsdkPrivate.h ) diff --git a/src/windows/WslcSDK/TerminationCallback.cpp b/src/windows/WslcSDK/TerminationCallback.cpp deleted file mode 100644 index bd98a59529..0000000000 --- a/src/windows/WslcSDK/TerminationCallback.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/*++ - -Copyright (c) Microsoft. All rights reserved. - -Module Name: - - TerminationCallback.cpp - -Abstract: - - Implementation of a type that implements ITerminationCallback. - ---*/ -#include "precomp.h" -#include "TerminationCallback.h" - -namespace { -WslcSessionTerminationReason ConvertReason(WSLCVirtualMachineTerminationReason Reason) -{ - switch (Reason) - { - case WSLCVirtualMachineTerminationReasonShutdown: - return WSLC_SESSION_TERMINATION_REASON_SHUTDOWN; - case WSLCVirtualMachineTerminationReasonCrashed: - return WSLC_SESSION_TERMINATION_REASON_CRASHED; - default: - return WSLC_SESSION_TERMINATION_REASON_UNKNOWN; - } -} -} // namespace - -TerminationCallback::TerminationCallback(WslcSessionTerminationCallback callback, PVOID context) : - m_callback(callback), m_context(context) -{ -} - -// TODO: Details from the runtime are dropped; should the SDK callback function be updated to include the reasons string? -HRESULT STDMETHODCALLTYPE TerminationCallback::OnTermination(WSLCVirtualMachineTerminationReason Reason, LPCWSTR) -{ - if (m_callback) - { - m_callback(ConvertReason(Reason), m_context); - } - - return S_OK; -} - -winrt::com_ptr TerminationCallback::CreateIf(const WslcSessionOptionsInternal* options) -{ - if (options->terminationCallback) - { - return winrt::make_self(options->terminationCallback, options->terminationCallbackContext); - } - else - { - return nullptr; - } -} diff --git a/src/windows/WslcSDK/TerminationCallback.h b/src/windows/WslcSDK/TerminationCallback.h deleted file mode 100644 index afd8542bf3..0000000000 --- a/src/windows/WslcSDK/TerminationCallback.h +++ /dev/null @@ -1,32 +0,0 @@ -/*++ - -Copyright (c) Microsoft. All rights reserved. - -Module Name: - - TerminationCallback.h - -Abstract: - - Header for a type that implements ITerminationCallback. - ---*/ -#pragma once -#include "wslc.h" -#include "wslcsdkprivate.h" -#include - -struct TerminationCallback : public winrt::implements -{ - TerminationCallback(WslcSessionTerminationCallback callback, PVOID context); - - // ITerminationCallback - HRESULT STDMETHODCALLTYPE OnTermination(WSLCVirtualMachineTerminationReason Reason, LPCWSTR Details) override; - - // Creates a TerminationCallback if the options provides a callback. - static winrt::com_ptr CreateIf(const WslcSessionOptionsInternal* options); - -private: - WslcSessionTerminationCallback m_callback = nullptr; - PVOID m_context = nullptr; -}; diff --git a/src/windows/WslcSDK/WslcsdkPrivate.h b/src/windows/WslcSDK/WslcsdkPrivate.h index 36a4568931..3ff96a528a 100644 --- a/src/windows/WslcSDK/WslcsdkPrivate.h +++ b/src/windows/WslcSDK/WslcsdkPrivate.h @@ -33,8 +33,6 @@ typedef struct WslcSessionOptionsInternal WslcVhdRequirements vhdRequirements; WslcSessionFeatureFlags featureFlags; - WslcSessionTerminationCallback terminationCallback; - PVOID terminationCallbackContext; } WslcSessionOptionsInternal; static_assert(sizeof(WslcSessionOptionsInternal) == WSLC_SESSION_OPTIONS_SIZE, "WSLC_SESSION_OPTIONS_INTERNAL size mismatch"); @@ -107,7 +105,6 @@ const WslcContainerOptionsInternal* GetInternalType(const WslcContainerSettings* struct WslcSessionImpl { wil::com_ptr session; - wil::com_ptr terminationCallback; }; WslcSessionImpl* GetInternalType(WslcSession handle); diff --git a/src/windows/WslcSDK/wslcsdk.cpp b/src/windows/WslcSDK/wslcsdk.cpp index 69366079aa..581f091db8 100644 --- a/src/windows/WslcSDK/wslcsdk.cpp +++ b/src/windows/WslcSDK/wslcsdk.cpp @@ -17,7 +17,6 @@ Module Name: #include "WslcsdkPrivate.h" #include "Defaults.h" #include "ProgressCallback.h" -#include "TerminationCallback.h" #include "Localization.h" #include "WslInstall.h" #include "wslutil.h" @@ -434,12 +433,6 @@ try runtimeSettings.MemoryMb = internalType->memoryMb; runtimeSettings.BootTimeoutMs = internalType->timeoutMS; runtimeSettings.NetworkingMode = WSLCNetworkingModeVirtioProxy; - auto terminationCallback = TerminationCallback::CreateIf(internalType); - if (terminationCallback) - { - result->terminationCallback.attach(terminationCallback.as().detach()); - runtimeSettings.TerminationCallback = terminationCallback.get(); - } runtimeSettings.FeatureFlags = ConvertFlags(internalType->featureFlags); WI_SetFlag(runtimeSettings.FeatureFlags, WslcFeatureFlagsVirtioFs); WI_SetFlag(runtimeSettings.FeatureFlags, WslcFeatureFlagsDnsTunneling); @@ -585,15 +578,39 @@ try } CATCH_RETURN(); -STDAPI WslcSetSessionSettingsTerminationCallback( - _In_ WslcSessionSettings* sessionSettings, _In_opt_ WslcSessionTerminationCallback terminationCallback, _In_opt_ PVOID terminationContext) +STDAPI WslcGetSessionTerminationEvent(_In_ WslcSession session, _Out_ HANDLE* terminationEvent) try { - auto internalType = CheckAndGetInternalType(sessionSettings); - RETURN_HR_IF(E_INVALIDARG, terminationCallback == nullptr && terminationContext != nullptr); + RETURN_HR_IF_NULL(E_POINTER, terminationEvent); + *terminationEvent = nullptr; + + auto internalType = CheckAndGetInternalType(session); + RETURN_HR_IF_NULL(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), internalType->session); + + RETURN_HR(internalType->session->GetTerminationEvent(terminationEvent)); +} +CATCH_RETURN(); + +STDAPI WslcGetSessionTerminationReason(_In_ WslcSession session, _Out_ WslcSessionTerminationReason* reason) +try +{ + static_assert( + WSLC_SESSION_TERMINATION_REASON_UNKNOWN == WSLCVirtualMachineTerminationReasonUnknown && + WSLC_SESSION_TERMINATION_REASON_SHUTDOWN == WSLCVirtualMachineTerminationReasonShutdown && + WSLC_SESSION_TERMINATION_REASON_CRASHED == WSLCVirtualMachineTerminationReasonCrashed, + "Termination reason enum values mismatch."); + + RETURN_HR_IF_NULL(E_POINTER, reason); + *reason = WSLC_SESSION_TERMINATION_REASON_UNKNOWN; + + auto internalType = CheckAndGetInternalType(session); + RETURN_HR_IF_NULL(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), internalType->session); + + WSLCVirtualMachineTerminationReason runtimeReason = WSLCVirtualMachineTerminationReasonUnknown; + wil::unique_cotaskmem_string details; + RETURN_IF_FAILED(internalType->session->GetTerminationReason(&runtimeReason, &details)); - internalType->terminationCallback = terminationCallback; - internalType->terminationCallbackContext = terminationContext; + *reason = static_cast(runtimeReason); return S_OK; } diff --git a/src/windows/WslcSDK/wslcsdk.def b/src/windows/WslcSDK/wslcsdk.def index 23a3a1c7fa..db42259c5f 100644 --- a/src/windows/WslcSDK/wslcsdk.def +++ b/src/windows/WslcSDK/wslcsdk.def @@ -16,7 +16,8 @@ WslcReleaseContainer WslcReleaseProcess WslcSetSessionSettingsFeatureFlags -WslcSetSessionSettingsTerminationCallback +WslcGetSessionTerminationEvent +WslcGetSessionTerminationReason WslcSetSessionSettingsCpuCount WslcSetSessionSettingsMemory WslcSetSessionSettingsTimeout diff --git a/src/windows/WslcSDK/wslcsdk.h b/src/windows/WslcSDK/wslcsdk.h index 2abbbc8b48..77b0e23783 100644 --- a/src/windows/WslcSDK/wslcsdk.h +++ b/src/windows/WslcSDK/wslcsdk.h @@ -42,7 +42,7 @@ EXTERN_C_START #define WSLC_E_REGISTRY_BLOCKED_BY_POLICY MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, WSLC_E_BASE + 13) /* 0x8004060D */ // Session values -#define WSLC_SESSION_OPTIONS_SIZE 88 +#define WSLC_SESSION_OPTIONS_SIZE 72 #define WSLC_SESSION_OPTIONS_ALIGNMENT 8 typedef struct WslcSessionSettings @@ -123,8 +123,6 @@ typedef enum WslcSessionTerminationReason WSLC_SESSION_TERMINATION_REASON_CRASHED = 2, } WslcSessionTerminationReason; -typedef __callback void(CALLBACK* WslcSessionTerminationCallback)(_In_ WslcSessionTerminationReason reason, _In_opt_ PVOID context); - STDAPI WslcInitSessionSettings(_In_ PCWSTR name, _In_ PCWSTR storagePath, _Out_ WslcSessionSettings* sessionSettings); STDAPI WslcCreateSession(_In_ WslcSessionSettings* sessionSettings, _Out_ WslcSession* session, _Outptr_opt_result_z_ PWSTR* errorMessage); @@ -138,9 +136,8 @@ STDAPI WslcSetSessionSettingsVhd(_In_ WslcSessionSettings* sessionSettings, _In_ STDAPI WslcSetSessionSettingsFeatureFlags(_In_ WslcSessionSettings* sessionSettings, _In_ WslcSessionFeatureFlags flags); -// Pass in Null for callback to clear the termination callback -STDAPI WslcSetSessionSettingsTerminationCallback( - _In_ WslcSessionSettings* sessionSettings, _In_opt_ WslcSessionTerminationCallback terminationCallback, _In_opt_ PVOID terminationContext); +STDAPI WslcGetSessionTerminationEvent(_In_ WslcSession session, _Out_ HANDLE* terminationEvent); +STDAPI WslcGetSessionTerminationReason(_In_ WslcSession session, _Out_ WslcSessionTerminationReason* reason); STDAPI WslcTerminateSession(_In_ WslcSession session); STDAPI WslcReleaseSession(_In_ WslcSession session); diff --git a/src/windows/service/exe/HcsVirtualMachine.cpp b/src/windows/service/exe/HcsVirtualMachine.cpp index 0bfb5062b0..424b9bd6f8 100644 --- a/src/windows/service/exe/HcsVirtualMachine.cpp +++ b/src/windows/service/exe/HcsVirtualMachine.cpp @@ -285,12 +285,6 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings) m_guestDeviceManager = std::make_shared<::GuestDeviceManager>(m_vmIdString, m_vmId); } - // Configure termination callback - if (Settings->TerminationCallback) - { - m_terminationCallback = Settings->TerminationCallback; - } - hcs::RegisterCallback(m_computeSystem.get(), &HcsVirtualMachine::OnVmExitCallback, this); // Create a listening socket for mini_init to connect to once the VM is running. @@ -686,8 +680,6 @@ CATCH_LOG() void HcsVirtualMachine::OnExit(const HCS_EVENT* Event) { - m_vmExitEvent.SetEvent(); - const auto exitStatus = wsl::shared::FromJson(Event->EventData); auto reason = WSLCVirtualMachineTerminationReasonUnknown; @@ -709,11 +701,28 @@ void HcsVirtualMachine::OnExit(const HCS_EVENT* Event) } } - if (m_terminationCallback) + // Cache the termination reason and details before signaling the exit event. { - LOG_IF_FAILED(m_terminationCallback->OnTermination(reason, Event->EventData)); + std::lock_guard lock(m_lock); + m_terminationReason = reason; + m_terminationDetails = Event->EventData; } + + m_vmExitEvent.SetEvent(); +} + +HRESULT HcsVirtualMachine::GetTerminationReason(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) +try +{ + RETURN_HR_IF(E_POINTER, Reason == nullptr || Details == nullptr); + + std::lock_guard lock(m_lock); + *Reason = m_terminationReason; + *Details = wil::make_cotaskmem_string(m_terminationDetails.c_str()).release(); + + return S_OK; } +CATCH_RETURN() void HcsVirtualMachine::OnCrash(const HCS_EVENT* Event) { diff --git a/src/windows/service/exe/HcsVirtualMachine.h b/src/windows/service/exe/HcsVirtualMachine.h index b8eba94902..5589d90842 100644 --- a/src/windows/service/exe/HcsVirtualMachine.h +++ b/src/windows/service/exe/HcsVirtualMachine.h @@ -46,6 +46,7 @@ class HcsVirtualMachine IFACEMETHOD(RemoveShare)(_In_ REFGUID ShareId) override; IFACEMETHOD(ApplyGuestCapabilities)(_In_ const WSLCGuestCapabilities* Capabilities) override; IFACEMETHOD(GetTerminationEvent)(_Out_ HANDLE* Event) override; + IFACEMETHOD(GetTerminationReason)(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) override; private: struct DiskInfo @@ -102,7 +103,9 @@ class HcsVirtualMachine std::atomic m_vmSavedStateCaptured = false; std::atomic m_crashLogCaptured = false; - wil::com_ptr m_terminationCallback; + // Termination reason and details, cached when the VM exits (see OnExit). Guarded by m_lock. + WSLCVirtualMachineTerminationReason m_terminationReason{WSLCVirtualMachineTerminationReasonUnknown}; + std::wstring m_terminationDetails; }; } // namespace wsl::windows::service::wslc diff --git a/src/windows/service/inc/wslc.idl b/src/windows/service/inc/wslc.idl index a1a89dc9fd..1af8cf7d45 100644 --- a/src/windows/service/inc/wslc.idl +++ b/src/windows/service/inc/wslc.idl @@ -97,16 +97,6 @@ typedef enum _WSLCSignal WSLCSignalSIGSYS = 31 } WSLCSignal; -[ - uuid(7BC4E198-6531-4FA6-ADE2-5EF3D2A04DFE), - pointer_default(unique), - object -] -interface ITerminationCallback : IUnknown -{ - HRESULT OnTermination(WSLCVirtualMachineTerminationReason Reason, LPCWSTR Details); -}; - [ uuid(5038842F-53DB-4F30-A6D0-A41B02C94AC1), pointer_default(unique), @@ -498,6 +488,11 @@ interface IWSLCVirtualMachine : IUnknown // Returns an event that is signaled when the VM exits (graceful or forced). HRESULT GetTerminationEvent([out, system_handle(sh_event)] HANDLE* Event); + + // Returns the cached termination reason and details. The values are only meaningful + // after the termination event has been signaled; before that the reason is + // WSLCVirtualMachineTerminationReasonUnknown and Details is an empty string. + HRESULT GetTerminationReason([out] WSLCVirtualMachineTerminationReason* Reason, [out] LPWSTR* Details); } typedef enum _WSLCSessionStorageFlags @@ -518,7 +513,6 @@ typedef struct _WSLCSessionSettings { ULONG MemoryMb; ULONG BootTimeoutMs; WSLCNetworkingMode NetworkingMode; - [unique] ITerminationCallback* TerminationCallback; WSLCFeatureFlags FeatureFlags; WSLCHandle DmesgOutput; WSLCSessionStorageFlags StorageFlags; @@ -728,6 +722,16 @@ interface IWSLCSession : IUnknown HRESULT GetId([out] ULONG* Id); HRESULT GetState([out] WSLCSessionState* State); + // Returns a one-off event that is signaled when the session terminates, whether due to an + // explicit Terminate() call or an unexpected VM exit. The returned handle is owned by the + // caller and remains valid (and observes the signaled state) even after the session is released. + HRESULT GetTerminationEvent([out, system_handle(sh_event)] HANDLE* Event); + + // Returns the cached termination reason and details. The values are only meaningful after the + // termination event has been signaled; before that the reason is + // WSLCVirtualMachineTerminationReasonUnknown and Details is an empty string. + HRESULT GetTerminationReason([out] WSLCVirtualMachineTerminationReason* Reason, [out] LPWSTR* Details); + // Image management. HRESULT PullImage([in] LPCSTR Image, [in, unique] LPCSTR RegistryAuthenticationInformation, [in, unique] IProgressCallback* ProgressCallback); HRESULT BuildImage([in] const WSLCBuildImageOptions* Options, [in, unique] IProgressCallback* ProgressCallback, [in, unique, system_handle(sh_event)] HANDLE CancelEvent); diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index c08957ac0b..8726e73a82 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -2479,9 +2479,20 @@ try if (m_vmExitedEvent && m_vmExitedEvent.is_signaled()) { WSL_LOG("SkippingGracefulShutdown_VmDead", TraceLoggingValue(m_id, "SessionId")); + + // The VM exited on its own, so it recorded the cause. + if (m_virtualMachine) + { + wil::unique_cotaskmem_string details; + LOG_IF_FAILED(m_virtualMachine->GetTerminationReason(&m_terminationReason, &details)); + m_terminationDetails = details ? details.get() : L""; + } } else { + // The VM is still alive, so this is a graceful shutdown initiated by us. + m_terminationReason = WSLCVirtualMachineTerminationReasonShutdown; + if (m_virtualMachine) { m_virtualMachine->OnSessionTerminated(); @@ -2520,7 +2531,8 @@ try m_swapVhdPath.clear(); } - m_terminated = true; + m_sessionTerminatedEvent.SetEvent(); + return S_OK; } CATCH_RETURN(); @@ -2739,9 +2751,42 @@ void WSLCSession::OnContainerDeleted(const WSLCContainerImpl* Container) HRESULT WSLCSession::GetState(_Out_ WSLCSessionState* State) { - *State = m_terminated ? WSLCSessionStateTerminated : WSLCSessionStateRunning; + *State = m_sessionTerminatedEvent.is_signaled() ? WSLCSessionStateTerminated : WSLCSessionStateRunning; + return S_OK; +} + +HRESULT WSLCSession::GetTerminationEvent(_Out_ HANDLE* Event) +try +{ + RETURN_HR_IF(E_POINTER, Event == nullptr); + + // Duplicate the "terminated" event (signaled once the session is fully done, via either an + // explicit Terminate() or an unexpected VM exit). The caller owns the returned handle, which + // stays valid (and observes the signaled state) even after the session is released. + *Event = wsl::windows::common::wslutil::DuplicateHandle(m_sessionTerminatedEvent.get()); + return S_OK; } +CATCH_RETURN(); + +HRESULT WSLCSession::GetTerminationReason(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) +try +{ + RETURN_HR_IF(E_POINTER, Reason == nullptr || Details == nullptr); + + auto lock = m_lock.lock_shared(); + + // The termination reason is only populated once the session has finished terminating. Fail + // rather than returning a placeholder so callers can distinguish "still running" from + // "terminated for an unknown reason". + RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_sessionTerminatedEvent.is_signaled()); + + *Reason = m_terminationReason; + *Details = wil::make_cotaskmem_string(m_terminationDetails.c_str()).release(); + + return S_OK; +} +CATCH_RETURN(); void WSLCSession::RecoverExistingContainers() { diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index 5a963140a8..dd924a7056 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -89,6 +89,8 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession IFACEMETHOD(GetId)(_Out_ ULONG* Id) override; IFACEMETHOD(GetState)(_Out_ WSLCSessionState* State) override; + IFACEMETHOD(GetTerminationEvent)(_Out_ HANDLE* Event) override; + IFACEMETHOD(GetTerminationReason)(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) override; // Image management. IFACEMETHOD(PullImage)(_In_ LPCSTR Image, _In_opt_ LPCSTR RegistryAuthenticationInformation, _In_opt_ IProgressCallback* ProgressCallback) override; @@ -229,7 +231,11 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession std::mutex m_networksLock; std::unordered_map m_networks; wil::unique_event m_sessionTerminatingEvent{wil::EventOptions::ManualReset}; + wil::unique_event m_sessionTerminatedEvent{wil::EventOptions::ManualReset}; wil::unique_event m_vmExitedEvent; + + WSLCVirtualMachineTerminationReason m_terminationReason{WSLCVirtualMachineTerminationReasonUnknown}; + std::wstring m_terminationDetails; wil::srwlock m_lock; IORelay m_ioRelay; std::optional m_containerdProcess; @@ -237,7 +243,6 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession WSLCFeatureFlags m_featureFlags{}; std::function m_destructionCallback; std::atomic m_terminating{false}; - std::atomic m_terminated{false}; wil::com_ptr m_pluginNotifier; diff --git a/src/windows/wslcsession/WSLCVirtualMachine.h b/src/windows/wslcsession/WSLCVirtualMachine.h index a38588a74c..ad68629059 100644 --- a/src/windows/wslcsession/WSLCVirtualMachine.h +++ b/src/windows/wslcsession/WSLCVirtualMachine.h @@ -162,6 +162,12 @@ class WSLCVirtualMachine return m_vmTerminatingEvent.get(); } + // Retrieves the cached termination reason and details from the underlying VM. + HRESULT GetTerminationReason(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) const + { + return m_vm->GetTerminationReason(Reason, Details); + } + GUID VmId() const { return m_vmId; diff --git a/test/windows/WSLCTests.cpp b/test/windows/WSLCTests.cpp index 0650cc26f6..f6aa743693 100644 --- a/test/windows/WSLCTests.cpp +++ b/test/windows/WSLCTests.cpp @@ -2711,46 +2711,26 @@ class WSLCTests } } - WSLC_TEST_METHOD(TerminationCallback) + WSLC_TEST_METHOD(TerminationEvent) { - class DECLSPEC_UUID("7BC4E198-6531-4FA6-ADE2-5EF3D2A04DFF") CallbackInstance - : public Microsoft::WRL::RuntimeClass, ITerminationCallback, IFastRundown> - { - - public: - CallbackInstance(std::function&& callback) : - m_callback(std::move(callback)) - { - } - - HRESULT OnTermination(WSLCVirtualMachineTerminationReason Reason, LPCWSTR Details) override - { - m_callback(Reason, Details); - return S_OK; - } - - private: - std::function m_callback; - }; + auto session = CreateSession(GetDefaultSessionSettings(L"termination-event-test")); - std::promise> promise; + wil::unique_handle terminationEvent; + VERIFY_SUCCEEDED(session->GetTerminationEvent(&terminationEvent)); + VERIFY_IS_NOT_NULL(terminationEvent.get()); - CallbackInstance callback{[&](WSLCVirtualMachineTerminationReason reason, LPCWSTR details) { - promise.set_value(std::make_pair(reason, details)); - }}; + // The reason is unavailable until the session has terminated. + WSLCVirtualMachineTerminationReason reason{}; + wil::unique_cotaskmem_string details; + VERIFY_ARE_EQUAL(session->GetTerminationReason(&reason, &details), HRESULT_FROM_WIN32(ERROR_INVALID_STATE)); - WSLCSessionSettings sessionSettings = GetDefaultSessionSettings(L"termination-callback-test"); - sessionSettings.TerminationCallback = &callback; + // Terminating the session should signal the event and record a graceful shutdown reason. + VERIFY_SUCCEEDED(session->Terminate()); - auto session = CreateSession(sessionSettings); + VERIFY_ARE_EQUAL(WaitForSingleObject(terminationEvent.get(), 30 * 1000), static_cast(WAIT_OBJECT_0)); - session.reset(); - auto future = promise.get_future(); - auto result = future.wait_for(std::chrono::seconds(30)); - VERIFY_ARE_EQUAL(result, std::future_status::ready); - auto [reason, details] = future.get(); + VERIFY_SUCCEEDED(session->GetTerminationReason(&reason, &details)); VERIFY_ARE_EQUAL(reason, WSLCVirtualMachineTerminationReasonShutdown); - VERIFY_ARE_NOT_EQUAL(details, L""); } WSLC_TEST_METHOD(BuildImageStuckCallbackCancellation) diff --git a/test/windows/WslcSdkTests.cpp b/test/windows/WslcSdkTests.cpp index e676b4128e..8cce628dc7 100644 --- a/test/windows/WslcSdkTests.cpp +++ b/test/windows/WslcSdkTests.cpp @@ -263,60 +263,53 @@ class WslcSdkTests VERIFY_ARE_EQUAL(WslcCreateSession(nullptr, &session2, nullptr), E_POINTER); } - WSLC_TEST_METHOD(TerminationCallbackViaTerminate) + WSLC_TEST_METHOD(TerminationEventViaTerminate) { - std::promise promise; - - auto callback = [](WslcSessionTerminationReason reason, PVOID context) { - auto* p = static_cast*>(context); - p->set_value(reason); - }; - - std::filesystem::path extraStorage = m_storagePath / "wslc-termcb-term-storage"; + std::filesystem::path extraStorage = m_storagePath / "wslc-termevt-term-storage"; WslcSessionSettings sessionSettings; - VERIFY_SUCCEEDED(WslcInitSessionSettings(L"wslc-termcb-term-test", extraStorage.c_str(), &sessionSettings)); + VERIFY_SUCCEEDED(WslcInitSessionSettings(L"wslc-termevt-term-test", extraStorage.c_str(), &sessionSettings)); VERIFY_SUCCEEDED(WslcSetSessionSettingsTimeout(&sessionSettings, 30 * 1000)); - VERIFY_SUCCEEDED(WslcSetSessionSettingsTerminationCallback(&sessionSettings, callback, &promise)); UniqueSession session; VERIFY_SUCCEEDED(WslcCreateSession(&sessionSettings, &session, nullptr)); - // Terminating the session should trigger a graceful shutdown and fire the callback. + wil::unique_handle terminationEvent; + VERIFY_SUCCEEDED(WslcGetSessionTerminationEvent(session.get(), &terminationEvent)); + VERIFY_IS_NOT_NULL(terminationEvent.get()); + + // Terminating the session should trigger a graceful shutdown and signal the event. VERIFY_SUCCEEDED(WslcTerminateSession(session.get())); - auto future = promise.get_future(); - VERIFY_ARE_EQUAL(future.wait_for(std::chrono::seconds(30)), std::future_status::ready); - VERIFY_ARE_EQUAL(future.get(), WSLC_SESSION_TERMINATION_REASON_SHUTDOWN); + VERIFY_ARE_EQUAL(WaitForSingleObject(terminationEvent.get(), 30 * 1000), static_cast(WAIT_OBJECT_0)); + + WslcSessionTerminationReason reason = WSLC_SESSION_TERMINATION_REASON_UNKNOWN; + VERIFY_SUCCEEDED(WslcGetSessionTerminationReason(session.get(), &reason)); + VERIFY_ARE_EQUAL(reason, WSLC_SESSION_TERMINATION_REASON_SHUTDOWN); } - WSLC_TEST_METHOD(TerminationCallbackViaRelease) + WSLC_TEST_METHOD(TerminationEventViaRelease) { - std::promise promise; - - auto callback = [](WslcSessionTerminationReason reason, PVOID context) { - auto* p = static_cast*>(context); - p->set_value(reason); - }; - - std::filesystem::path extraStorage = m_storagePath / "wslc-termcb-release-storage"; + std::filesystem::path extraStorage = m_storagePath / "wslc-termevt-release-storage"; WslcSessionSettings sessionSettings; - VERIFY_SUCCEEDED(WslcInitSessionSettings(L"wslc-termcb-release-test", extraStorage.c_str(), &sessionSettings)); + VERIFY_SUCCEEDED(WslcInitSessionSettings(L"wslc-termevt-release-test", extraStorage.c_str(), &sessionSettings)); VERIFY_SUCCEEDED(WslcSetSessionSettingsTimeout(&sessionSettings, 30 * 1000)); - VERIFY_SUCCEEDED(WslcSetSessionSettingsTerminationCallback(&sessionSettings, callback, &promise)); UniqueSession session; VERIFY_SUCCEEDED(WslcCreateSession(&sessionSettings, &session, nullptr)); - // Releasing the session should trigger a graceful shutdown and fire the callback. + // The termination event is owned by the caller and stays valid even after the session is released. + wil::unique_handle terminationEvent; + VERIFY_SUCCEEDED(WslcGetSessionTerminationEvent(session.get(), &terminationEvent)); + VERIFY_IS_NOT_NULL(terminationEvent.get()); + + // Releasing the session should trigger a graceful shutdown and signal the event. VERIFY_SUCCEEDED(WslcReleaseSession(session.get())); - // Calling WslcSessionRelease will destroy the session + // Calling WslcReleaseSession will destroy the session. session.release(); - auto future = promise.get_future(); - VERIFY_ARE_EQUAL(future.wait_for(std::chrono::seconds(30)), std::future_status::ready); - VERIFY_ARE_EQUAL(future.get(), WSLC_SESSION_TERMINATION_REASON_SHUTDOWN); + VERIFY_ARE_EQUAL(WaitForSingleObject(terminationEvent.get(), 30 * 1000), static_cast(WAIT_OBJECT_0)); } // ----------------------------------------------------------------------- From c9da34e51214f540e5d0ccc1ed3b78268dfdf3d0 Mon Sep 17 00:00:00 2001 From: kvega005 Date: Wed, 10 Jun 2026 16:20:32 -0700 Subject: [PATCH 02/22] Fux build --- src/windows/WslcSDK/winrt/Session.cpp | 26 +++++++++++++++++++++++--- src/windows/WslcSDK/winrt/Session.h | 10 +++++++--- src/windows/WslcSDK/wslcsdk.cpp | 3 --- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/windows/WslcSDK/winrt/Session.cpp b/src/windows/WslcSDK/winrt/Session.cpp index 5a29437c99..1234b60094 100644 --- a/src/windows/WslcSDK/winrt/Session.cpp +++ b/src/windows/WslcSDK/winrt/Session.cpp @@ -45,6 +45,18 @@ Session::Session(winrt::Microsoft::WSL::Containers::SessionSettings const& setti } } +Session::~Session() +{ + // Stop watching the termination handle and drain any in-flight callback before the rest of the + // session is torn down. Releasing the session handle below can signal the termination event, so the + // wait must be cancelled first to avoid the callback running against a half-destructed object. + if (m_terminationWait) + { + SetThreadpoolWait(m_terminationWait.get(), nullptr, nullptr); + WaitForThreadpoolWaitCallbacks(m_terminationWait.get(), TRUE); + } +} + void Session::Start() { if (m_session) @@ -52,12 +64,16 @@ void Session::Start() throw winrt::hresult_illegal_method_call(L"Session has already been started"); } - winrt::check_hresult(WslcSetSessionSettingsTerminationCallback(GetStructPointer(m_settings), TerminatedCallback, /* context */ this)); - wil::unique_cotaskmem_string errorMessage; auto hr = WslcCreateSession(GetStructPointer(m_settings), m_session.put(), errorMessage.put()); THROW_MSG_IF_FAILED(hr, errorMessage); m_settings = nullptr; + + winrt::check_hresult(WslcGetSessionTerminationEvent(m_session.get(), m_terminationEvent.put())); + + m_terminationWait.reset(CreateThreadpoolWait(&Session::OnTerminated, this, nullptr)); + THROW_LAST_ERROR_IF_NULL(m_terminationWait); + SetThreadpoolWait(m_terminationWait.get(), m_terminationEvent.get(), nullptr); } void Session::EnsureStarted() const @@ -300,11 +316,15 @@ WslcSession Session::ToHandle() return m_session.get(); } -void CALLBACK Session::TerminatedCallback(_In_ WslcSessionTerminationReason reason, _In_opt_ PVOID context) noexcept +void CALLBACK Session::OnTerminated(PTP_CALLBACK_INSTANCE /* instance */, PVOID context, PTP_WAIT /* wait */, TP_WAIT_RESULT /* waitResult */) noexcept { try { auto session = static_cast(context); + + WslcSessionTerminationReason reason = WSLC_SESSION_TERMINATION_REASON_UNKNOWN; + LOG_IF_FAILED(WslcGetSessionTerminationReason(session->m_session.get(), &reason)); + session->m_terminatedEvent(static_cast(reason)); } CATCH_LOG(); diff --git a/src/windows/WslcSDK/winrt/Session.h b/src/windows/WslcSDK/winrt/Session.h index 954591262c..cac2d6488e 100644 --- a/src/windows/WslcSDK/winrt/Session.h +++ b/src/windows/WslcSDK/winrt/Session.h @@ -20,6 +20,7 @@ struct Session : SessionT { Session() = default; Session(winrt::Microsoft::WSL::Containers::SessionSettings const& settings); + ~Session(); void Start(); void Terminate(); @@ -45,12 +46,15 @@ struct Session : SessionT void EnsureStarted() const; winrt::Microsoft::WSL::Containers::SessionSettings m_settings; // Only kept until Start() is called - static void CALLBACK TerminatedCallback(_In_ WslcSessionTerminationReason reason, _In_opt_ PVOID context) noexcept; + // Threadpool callback that raises the Terminated event once the session's termination handle is signaled. + static void CALLBACK OnTerminated(PTP_CALLBACK_INSTANCE instance, PVOID context, PTP_WAIT wait, TP_WAIT_RESULT waitResult) noexcept; - // Releasing the session handle may trigger the termination callback. - // Keep these two in this order so that the session handle is released before the termination event is destructed. winrt::event m_terminatedEvent; wil::unique_any m_session{nullptr}; + + // Bridges the one-off termination event surfaced by the SDK to the WinRT Terminated event. + wil::unique_handle m_terminationEvent; + wil::unique_threadpool_wait m_terminationWait; }; } // namespace winrt::Microsoft::WSL::Containers::implementation namespace winrt::Microsoft::WSL::Containers::factory_implementation { diff --git a/src/windows/WslcSDK/wslcsdk.cpp b/src/windows/WslcSDK/wslcsdk.cpp index 581f091db8..9d39744aaa 100644 --- a/src/windows/WslcSDK/wslcsdk.cpp +++ b/src/windows/WslcSDK/wslcsdk.cpp @@ -621,10 +621,7 @@ try { auto internalType = CheckAndGetInternalTypeUniquePointer(session); - // Intentionally destroy session before termination callback in the event that - // the termination callback ends up being invoked by session destruction. internalType->session.reset(); - internalType->terminationCallback.reset(); return S_OK; } From 2fb90c0268f5954c9da93e55f6215c984b178e2e Mon Sep 17 00:00:00 2001 From: kvega005 Date: Wed, 10 Jun 2026 16:37:45 -0700 Subject: [PATCH 03/22] Fix --- src/windows/wslcsession/WSLCSession.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 8726e73a82..0568347911 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -2760,9 +2760,9 @@ try { RETURN_HR_IF(E_POINTER, Event == nullptr); - // Duplicate the "terminated" event (signaled once the session is fully done, via either an - // explicit Terminate() or an unexpected VM exit). The caller owns the returned handle, which - // stays valid (and observes the signaled state) even after the session is released. + *Event = nullptr; + + // Duplicate the "terminated" event. The caller owns the returned handle, which stays valid even after the session is released. *Event = wsl::windows::common::wslutil::DuplicateHandle(m_sessionTerminatedEvent.get()); return S_OK; @@ -2774,11 +2774,11 @@ try { RETURN_HR_IF(E_POINTER, Reason == nullptr || Details == nullptr); + *Reason = WSLCVirtualMachineTerminationReasonUnknown; + *Details = nullptr; + auto lock = m_lock.lock_shared(); - // The termination reason is only populated once the session has finished terminating. Fail - // rather than returning a placeholder so callers can distinguish "still running" from - // "terminated for an unknown reason". RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_sessionTerminatedEvent.is_signaled()); *Reason = m_terminationReason; From be6b59cbda9a7e23485eb3586604874f6df11a7c Mon Sep 17 00:00:00 2001 From: kvega005 Date: Wed, 10 Jun 2026 16:43:16 -0700 Subject: [PATCH 04/22] Address feedback --- src/windows/wslcsession/WSLCSession.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 0568347911..0ac3c3a797 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -2777,8 +2777,8 @@ try *Reason = WSLCVirtualMachineTerminationReasonUnknown; *Details = nullptr; - auto lock = m_lock.lock_shared(); - + // m_terminationReason/m_terminationDetails are written once before m_sessionTerminatedEvent is + // signaled and never modified afterward, so observing the signaled event safely publishes them. RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_sessionTerminatedEvent.is_signaled()); *Reason = m_terminationReason; From aa4bd4bc938b5d1f506e51ac8d85fa3bea493a15 Mon Sep 17 00:00:00 2001 From: kvega005 Date: Thu, 11 Jun 2026 15:10:58 -0700 Subject: [PATCH 05/22] Address feedback. --- src/windows/WslcSDK/winrt/Session.cpp | 12 ------------ src/windows/WslcSDK/winrt/Session.h | 1 - src/windows/wslcsession/WSLCSession.cpp | 2 +- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/windows/WslcSDK/winrt/Session.cpp b/src/windows/WslcSDK/winrt/Session.cpp index 1234b60094..46bb7afdcc 100644 --- a/src/windows/WslcSDK/winrt/Session.cpp +++ b/src/windows/WslcSDK/winrt/Session.cpp @@ -45,18 +45,6 @@ Session::Session(winrt::Microsoft::WSL::Containers::SessionSettings const& setti } } -Session::~Session() -{ - // Stop watching the termination handle and drain any in-flight callback before the rest of the - // session is torn down. Releasing the session handle below can signal the termination event, so the - // wait must be cancelled first to avoid the callback running against a half-destructed object. - if (m_terminationWait) - { - SetThreadpoolWait(m_terminationWait.get(), nullptr, nullptr); - WaitForThreadpoolWaitCallbacks(m_terminationWait.get(), TRUE); - } -} - void Session::Start() { if (m_session) diff --git a/src/windows/WslcSDK/winrt/Session.h b/src/windows/WslcSDK/winrt/Session.h index cac2d6488e..f6e8f7bd14 100644 --- a/src/windows/WslcSDK/winrt/Session.h +++ b/src/windows/WslcSDK/winrt/Session.h @@ -20,7 +20,6 @@ struct Session : SessionT { Session() = default; Session(winrt::Microsoft::WSL::Containers::SessionSettings const& settings); - ~Session(); void Start(); void Terminate(); diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 81272d86d0..d2fcde2a47 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -2979,7 +2979,7 @@ try *Event = nullptr; // Duplicate the "terminated" event. The caller owns the returned handle, which stays valid even after the session is released. - *Event = wsl::windows::common::wslutil::DuplicateHandle(m_sessionTerminatedEvent.get()); + *Event = wsl::windows::common::wslutil::DuplicateHandle(m_sessionTerminatedEvent.get(), SYNCHRONIZE); return S_OK; } From 35632550cd345cd980eabf0eb41a921610fe1892 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Tue, 9 Jun 2026 22:56:56 -0700 Subject: [PATCH 06/22] wslc: idle-terminate per-user VMs when no containers are active Summary ------- Decouples a wslc session's VM lifetime from the wslcsession.exe process so the VM can be torn down when nothing needs it and recreated on demand later, while the per-user session and its bookkeeping survive across VM restarts. Previously a VM was created 1:1 with a session: the SYSTEM service eagerly built an HcsVirtualMachine and handed the COM pointer to wslcsession.exe, and any VM exit permanently terminated the (single-shot) session. Detailed description -------------------- * New service-side IWSLCVirtualMachineFactory lets the per-user process mint a fresh VM at any time. IWSLCSessionFactory::CreateSession / IWSLCSession::Initialize now take the factory instead of an eager IWSLCVirtualMachine. WSLCVirtualMachineFactory deep-copies the settings and duplicates the dmesg handle per creation. * WSLCSession Initialize is now lightweight (persists settings, starts the idle worker). VM bring-up/teardown is split into re-runnable StartVmLockHeld / StopVmLockHeld / TearDownVmLockHeld driven by a VmState machine (None/Starting/Running/Stopping). * On-demand bring-up + idle teardown: every VM-requiring operation takes a VmLease RAII (EnsureVmRunning + activity count); when activity drops to zero and no container is in the Created or Running state, the idle worker tears the VM down immediately. Idle termination is enabled only for persistent-storage sessions. * VM-exit disambiguation: intentional stops (m_vmStopRequested) keep the session alive; unexpected exits still Terminate() permanently. * New IWSLCSession::GetVmDiagnostics (Running + StartCount) exposes VM lifecycle for tests/diagnostics without bringing the VM up or counting as activity. Concurrency fixes folded in (compile-validated; flagged for runtime stress): * IORelay self-join: TearDownVmLockHeld no longer destroys the IO relay from its own thread (added IORelay::IsRelayThread); the stopped relay is left for ~WSLCSession on a non-relay thread. * Lease-vs-idle-stop race: VmLease retries instead of throwing ERROR_INVALID_STATE when the idle worker tears down in the bring-up window. * Idle-worker-vs-crash deadlock: IdleWorker bails when the VM exit event is already signaled, letting the relay-thread Terminate path own teardown. Validation steps ----------------- * Full solution build (x64 Debug) green, including wsltests.dll. * Copyright-header validation: no new violations. * Added E2E tests (test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp): lazy start + idle stop, recreate-on-demand + state persistence, Created container keeps VM alive, and concurrent recreate stress (lease/idle race). * NOT runtime-validated here (requires deploy + Administrator + container runtime); run bin\x64\Debug\test.bat /name:*VmIdle* and stress the two race fixes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../service/exe/WSLCSessionManager.cpp | 3 +- src/windows/service/inc/wslc.idl | 19 +- src/windows/wslcsession/IORelay.cpp | 5 + src/windows/wslcsession/IORelay.h | 6 + src/windows/wslcsession/WSLCContainer.cpp | 4 + src/windows/wslcsession/WSLCSession.cpp | 614 ++++++++++++++---- src/windows/wslcsession/WSLCSession.h | 75 ++- test/windows/wslc/e2e/WSLCE2EHelpers.h | 5 + test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp | 143 ++++ 9 files changed, 735 insertions(+), 139 deletions(-) create mode 100644 test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp diff --git a/src/windows/service/exe/WSLCSessionManager.cpp b/src/windows/service/exe/WSLCSessionManager.cpp index 339f3ce6ca..e08883608d 100644 --- a/src/windows/service/exe/WSLCSessionManager.cpp +++ b/src/windows/service/exe/WSLCSessionManager.cpp @@ -270,8 +270,7 @@ void WSLCSessionManagerImpl::CreateSession( g_pluginManager, sessionId, creatorPid, std::wstring(resolvedDisplayName), wil::shared_handle(sharedToken), std::vector(storedSid)); // Create the VM factory in the SYSTEM service (privileged). The per-user session - // uses it to create the VM. Funneling VM creation through a factory lets the session - // own when VMs are created, rather than having one handed to it up front. + // uses it to create VMs on demand and recreate them after idle-termination. auto vmFactory = Microsoft::WRL::Make(Settings); // Launch per-user COM server factory and add it to a fresh per-session job object for crash cleanup. diff --git a/src/windows/service/inc/wslc.idl b/src/windows/service/inc/wslc.idl index c935ea852a..5d029a410d 100644 --- a/src/windows/service/inc/wslc.idl +++ b/src/windows/service/inc/wslc.idl @@ -756,6 +756,18 @@ typedef enum _WSLCSessionState WSLCSessionStateTerminated = 1 } WSLCSessionState; +// Diagnostics describing the session's on-demand VM lifecycle. Used by tests and +// troubleshooting to observe lazy VM bring-up and idle termination without affecting +// the VM (querying these does not bring the VM up or count as activity). +typedef struct _WSLCVmDiagnostics +{ + // TRUE when the backing VM is currently running. + boolean Running; + + // Number of times the VM has been (re)created over the session's lifetime. + unsigned long StartCount; +} WSLCVmDiagnostics; + // Settings for IWSLCSession::Initialize - passed from service to per-user process typedef struct _WSLCSessionInitSettings { @@ -792,6 +804,9 @@ interface IWSLCSession : IUnknown // WSLCVirtualMachineTerminationReasonUnknown and Details is an empty string. HRESULT GetTerminationReason([out] WSLCVirtualMachineTerminationReason* Reason, [out] LPWSTR* Details); + // Reports on-demand VM lifecycle diagnostics. Does not bring the VM up or count as activity. + HRESULT GetVmDiagnostics([out] WSLCVmDiagnostics* Diagnostics); + // Image management. HRESULT PullImage([in] LPCSTR Image, [in, unique] LPCSTR RegistryAuthenticationInformation, [in, unique] IProgressCallback* ProgressCallback, [in, unique] IWarningCallback* WarningCallback); HRESULT BuildImage([in] const WSLCBuildImageOptions* Options, [in, unique] IProgressCallback* ProgressCallback, [in, unique, system_handle(sh_event)] HANDLE CancelEvent); @@ -832,7 +847,9 @@ interface IWSLCSession : IUnknown // Returns a handle to this COM server process (used to add to job object). HRESULT GetProcessHandle([out, system_handle(sh_process)] HANDLE* ProcessHandle); - // Initializes the session with a VM factory. VMs are created through the factory. + // Initializes the session. The session creates VMs on demand via the supplied + // factory: a VM is brought up when work requires it and idle-terminated when there + // are no running containers and no in-flight operations. HRESULT Initialize( [in] const WSLCSessionInitSettings* Settings, [in] IWSLCVirtualMachineFactory* VmFactory, diff --git a/src/windows/wslcsession/IORelay.cpp b/src/windows/wslcsession/IORelay.cpp index 6677bca87a..6eec63123b 100644 --- a/src/windows/wslcsession/IORelay.cpp +++ b/src/windows/wslcsession/IORelay.cpp @@ -68,6 +68,11 @@ void IORelay::Stop() } } +bool IORelay::IsRelayThread() const noexcept +{ + return m_thread.get_id() == std::this_thread::get_id(); +} + void IORelay::Run() try { diff --git a/src/windows/wslcsession/IORelay.h b/src/windows/wslcsession/IORelay.h index 879d3fee13..844c16dee6 100644 --- a/src/windows/wslcsession/IORelay.h +++ b/src/windows/wslcsession/IORelay.h @@ -30,6 +30,12 @@ class IORelay void Stop(); + // Returns true if the calling thread is the IORelay's own worker thread (i.e. the call + // is being made from a handle callback). Destroying the IORelay from this thread would + // join the thread with itself and call std::terminate(), so callers that may run on the + // relay thread must check this before destroying the object. + bool IsRelayThread() const noexcept; + private: void Start(); void Run(); diff --git a/src/windows/wslcsession/WSLCContainer.cpp b/src/windows/wslcsession/WSLCContainer.cpp index 7da28ca63d..8de0c11ba1 100644 --- a/src/windows/wslcsession/WSLCContainer.cpp +++ b/src/windows/wslcsession/WSLCContainer.cpp @@ -2129,6 +2129,10 @@ __requires_lock_held(m_lock) void WSLCContainerImpl::Transition(WSLCContainerSta m_state = State; m_stateChangedAt = stateChangedAt.value_or(static_cast(std::time(nullptr))); + + // A container transitioning to a terminal state (e.g. Exited) may leave the session idle. + // Ask the session to re-evaluate whether the VM can be torn down. This is a non-blocking signal. + m_wslcSession.RequestIdleCheck(); } WSLCContainer::WSLCContainer(WSLCContainerImpl* impl, WSLCSession& session, std::function&& OnDeleted) : diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 642804cbdc..2e6e988aa3 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -296,7 +296,7 @@ HRESULT WSLCSession::Initialize( try { RETURN_HR_IF(E_POINTER, Settings == nullptr || VmFactory == nullptr); - RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_ALREADY_INITIALIZED), m_virtualMachine.has_value()); + RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_ALREADY_INITIALIZED), m_vmFactory != nullptr); THROW_HR_IF_MSG( E_INVALIDARG, WI_IsAnyFlagSet(Settings->FeatureFlags, ~WSLCFeatureFlagsValid), "Invalid feature flags: 0x%x", Settings->FeatureFlags); @@ -307,7 +307,7 @@ try Settings->StorageFlags); // Set up a warning context for the duration of initialization so that non-fatal - // failures (e.g., container/volume/network recovery) are streamed to the CLI. + // failures are streamed to the CLI. WSLCExecutionContext warningContext(this, WarningCallback); // N.B. No locking is required because Initialize() is always called before the session is returned to the caller. @@ -316,9 +316,12 @@ try m_creatorProcessName = Settings->CreatorProcessName ? Settings->CreatorProcessName : L""; m_featureFlags = Settings->FeatureFlags; m_pluginNotifier = PluginNotifier; + m_vmFactory = VmFactory; - // Get user token for the current process + // Persist a deep copy of the settings (and the creating user's SID) required to + // (re)create the VM on demand. const auto tokenInfo = wil::get_token_information(GetCurrentProcessToken()); + PersistSettings(*Settings, tokenInfo->User.Sid); WSL_LOG( "SessionInitialized", @@ -326,58 +329,438 @@ try TraceLoggingValue(m_displayName.c_str(), "DisplayName"), TraceLoggingValue(m_creatorProcessName.c_str(), "CreatorProcess")); - // Create the VM through the factory. The VM produces crash events; the session multiplexes - // them out to any registered ICrashDumpCallback subscribers via OnCrashDumpWritten. + // The VM is created lazily on the first operation that requires it (see EnsureVmRunning) + // and torn down when the session becomes idle. Start the worker that performs idle teardown. + m_idleThread = std::thread([this]() { IdleWorker(); }); + + return S_OK; +} +CATCH_RETURN() + +void WSLCSession::PersistSettings(const WSLCSessionInitSettings& Settings, PSID UserSid) +{ + m_settings = Settings; + + // Repoint the string fields at storage owned by the session so they outlive the caller's buffers. + m_settings.DisplayName = m_displayName.c_str(); + + if (Settings.CreatorProcessName != nullptr) + { + m_settingsCreatorProcessName = Settings.CreatorProcessName; + m_settings.CreatorProcessName = m_settingsCreatorProcessName->c_str(); + } + else + { + m_settings.CreatorProcessName = nullptr; + } + + if (Settings.StoragePath != nullptr) + { + m_settingsStoragePath = Settings.StoragePath; + m_settings.StoragePath = m_settingsStoragePath->c_str(); + } + else + { + m_settings.StoragePath = nullptr; + } + + if (Settings.RootVhdTypeOverride != nullptr) + { + m_settingsRootVhdTypeOverride = Settings.RootVhdTypeOverride; + m_settings.RootVhdTypeOverride = m_settingsRootVhdTypeOverride->c_str(); + } + else + { + m_settings.RootVhdTypeOverride = nullptr; + } + + if (UserSid != nullptr) + { + const auto length = GetLengthSid(UserSid); + const auto* bytes = reinterpret_cast(UserSid); + m_userSid.assign(bytes, bytes + length); + } + else + { + m_userSid.clear(); + } +} + +bool WSLCSession::IdleTerminationEnabled() const noexcept +{ + // Only tear the VM down when there is persistent storage to recover from. A tmpfs-backed + // session would lose all image/container state on teardown, so its VM is kept alive once started. + return m_settings.StoragePath != nullptr; +} + +void WSLCSession::EnsureVmRunning() +{ + if (m_vmState.load() == VmState::Running) + { + return; + } + + auto lock = m_lock.lock_exclusive(); + + // Do not (re)start the VM once the session is terminating or has terminated. This also + // bounds VmLease's retry loop: a lease that races with Terminate() fails here instead of + // restarting a VM that is being permanently torn down. + THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), m_terminating.load() || m_sessionTerminatedEvent.is_signaled()); + + if (m_vmState.load() == VmState::Running) + { + return; + } + + StartVmLockHeld(); +} + +void WSLCSession::StartVmLockHeld() +{ + WI_ASSERT(m_vmState.load() != VmState::Running); + + WSL_LOG("WslcVmStarting", TraceLoggingValue(m_id, "SessionId")); + + m_vmState.store(VmState::Starting); + m_vmStopRequested.store(false); + + // Tear everything back down if bring-up fails partway through. + auto startCleanup = wil::scope_exit_log(WI_DIAGNOSTICS_INFO, [&]() { + TearDownVmLockHeld(); + m_vmState.store(VmState::None); + }); + + // Create a fresh IO relay for this VM instance. The previous one (if any) was stopped + // during teardown and cannot be restarted. + m_ioRelay.emplace(); + + // Create the VM via the factory. The VM produces crash events; the session multiplexes them + // out to any registered ICrashDumpCallback subscribers via OnCrashDumpWritten. wil::com_ptr vm; - THROW_IF_FAILED(VmFactory->CreateVirtualMachine(&vm)); + THROW_IF_FAILED(m_vmFactory->CreateVirtualMachine(&vm)); m_virtualMachine.emplace( vm.get(), - Settings, + &m_settings, m_sessionTerminatingEvent.get(), std::bind(&WSLCSession::OnCrashDumpWritten, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5)); - - // Make sure that everything is destroyed correctly if an exception is thrown. - auto errorCleanup = wil::scope_exit_log(WI_DIAGNOSTICS_INFO, [&]() { LOG_IF_FAILED(Terminate()); }); - m_virtualMachine->Initialize(); // Get an event from the service that is signaled when the VM exits. + m_vmExitedEvent.reset(); THROW_IF_FAILED(vm->GetTerminationEvent(&m_vmExitedEvent)); // Configure storage. - ConfigureStorage(*Settings, tokenInfo->User.Sid); + ConfigureStorage(m_settings, m_userSid.empty() ? nullptr : reinterpret_cast(m_userSid.data())); - // Launch containerd first + // Launch containerd first, then dockerd with the external containerd socket. StartContainerd(); - // Launch dockerd with external containerd socket + // Reset the readiness event before (re)starting dockerd so a stale signal from a prior + // VM instance is not observed. + m_dockerdReadyEvent.ResetEvent(); StartDockerd(); // Wait for dockerd to be ready before starting the event tracker. THROW_WIN32_IF_MSG( - ERROR_TIMEOUT, !m_dockerdReadyEvent.wait(Settings->BootTimeoutMs), "Timed out waiting for dockerd to start"); + ERROR_TIMEOUT, !m_dockerdReadyEvent.wait(m_settings.BootTimeoutMs), "Timed out waiting for dockerd to start"); auto [_, __, channel] = m_virtualMachine->Fork(WSLC_FORK::Thread); m_dockerClient.emplace(std::move(channel), m_virtualMachine->TerminatingEvent(), m_virtualMachine->VmId(), 10 * 1000); // Start the event tracker. - m_eventTracker.emplace(m_dockerClient.value(), *this, m_ioRelay); + m_eventTracker.emplace(m_dockerClient.value(), *this, *m_ioRelay); m_volumes.emplace(m_dockerClient.value(), m_virtualMachine.value(), m_eventTracker.value(), m_storageVhdPath.parent_path()); // Monitor for unexpected VM exit. - m_ioRelay.AddHandle(std::make_unique(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this))); + m_ioRelay->AddHandle(std::make_unique(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this))); // Recover any existing resources from storage. RecoverExistingNetworks(); RecoverExistingContainers(); - errorCleanup.release(); - return S_OK; + m_vmState.store(VmState::Running); + m_vmStartCount.fetch_add(1); + startCleanup.release(); + + WSL_LOG("WslcVmStarted", TraceLoggingValue(m_id, "SessionId")); +} + +void WSLCSession::StopVmLockHeld() +{ + if (m_vmState.load() != VmState::Running) + { + return; + } + + WSL_LOG("WslcVmIdleStop", TraceLoggingValue(m_id, "SessionId")); + + // Flag the teardown as intentional so VM/dockerd/containerd exit callbacks (which fire + // from the IO relay thread while we hold the lock) do not treat it as a crash. + m_vmStopRequested.store(true); + m_vmState.store(VmState::Stopping); + + TearDownVmLockHeld(); + + m_vmState.store(VmState::None); + m_vmStopRequested.store(false); +} + +void WSLCSession::TearDownVmLockHeld(bool CaptureTerminationReason) +{ + std::lock_guard containersLock(m_containersLock); + std::lock_guard networksLock(m_networksLock); + + m_containers.clear(); + m_volumes.reset(); + m_networks.clear(); + + // Stop the IO relay. + // This stops: + // - container state monitoring. + // - container init process relays + // - execs relays + // - container logs relays + if (m_ioRelay) + { + m_ioRelay->Stop(); + } + + { + std::lock_guard allocatedPortsLock(m_allocatedPortsLock); + m_allocatedPorts.clear(); + } + + m_eventTracker.reset(); + m_dockerClient.reset(); + + if (CaptureTerminationReason) + { + // Default: an explicit/graceful teardown is a shutdown (the VM is still alive and we are + // bringing it down). Overridden below if the VM exited on its own and recorded a cause. + m_terminationReason = WSLCVirtualMachineTerminationReasonShutdown; + } + + // Check if the VM has already exited (e.g., killed externally). + // If so, skip operations that require a live VM to avoid unnecessary waits. + // N.B. m_vmExitedEvent may be uninitialized if teardown runs before GetTerminationEvent() succeeds. + if (m_vmExitedEvent && m_vmExitedEvent.is_signaled()) + { + WSL_LOG("SkippingGracefulShutdown_VmDead", TraceLoggingValue(m_id, "SessionId")); + + // The VM exited on its own, so it recorded the cause. + if (CaptureTerminationReason && m_virtualMachine) + { + wil::unique_cotaskmem_string details; + LOG_IF_FAILED(m_virtualMachine->GetTerminationReason(&m_terminationReason, &details)); + m_terminationDetails = details ? details.get() : L""; + } + } + else if (m_virtualMachine) + { + m_virtualMachine->OnSessionTerminated(); + + // Stop dockerd first, then containerd (dockerd is a client of containerd). + // N.B. dockerd waits a couple seconds if there are any outstanding HTTP request sockets opened. + if (m_dockerdProcess.has_value()) + { + auto dockerdExitCode = StopProcess(m_dockerdProcess.value(), c_processTerminateTimeoutMs, c_processKillTimeoutMs); + WSL_LOG("DockerdExit", TraceLoggingValue(dockerdExitCode, "code")); + } + + if (m_containerdProcess.has_value()) + { + auto containerdExitCode = StopProcess(m_containerdProcess.value(), c_processTerminateTimeoutMs, c_processKillTimeoutMs); + WSL_LOG("ContainerdExit", TraceLoggingValue(containerdExitCode, "code")); + } + + // N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running. + try + { + m_virtualMachine->Unmount(c_containerdStorage); + } + CATCH_LOG(); + } + + m_dockerdProcess.reset(); + m_containerdProcess.reset(); + m_virtualMachine.reset(); + + // Destroy the (stopped) relay so the next StartVm can create a fresh one. + // + // N.B. The relay must NOT be destroyed from its own thread: ~IORelay joins the relay + // thread, and joining a thread from itself calls std::terminate(). This situation arises + // on the unexpected-VM-exit path, where OnVmExited() runs on the relay thread and drives + // Terminate() -> TearDownVmLockHeld(). In that (terminal) case the stopped relay and the + // VM-exit event it watches are left in place and destroyed later by ~WSLCSession on a + // different thread. On the idle-stop and external-terminate paths this runs on a non-relay + // thread, so both are destroyed here. (StartVmLockHeld resets m_vmExitedEvent before reuse.) + if (!m_ioRelay || !m_ioRelay->IsRelayThread()) + { + m_ioRelay.reset(); + m_vmExitedEvent.reset(); + } + + // Delete the ephemeral swap VHD now that the VM is gone. + if (!m_swapVhdPath.empty()) + { + LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(m_swapVhdPath.c_str())); + m_swapVhdPath.clear(); + } +} + +bool WSLCSession::HasActiveContainerLockHeld() +{ + std::lock_guard containersLock(m_containersLock); + + // A container in the Created or Running state keeps the VM alive (it is non-terminal and + // may still be started/used). Exited containers do not. + return std::ranges::any_of(m_containers, [](const auto& entry) { + const auto state = entry.second->State(); + return state == WslcContainerStateCreated || state == WslcContainerStateRunning; + }); +} + +void WSLCSession::RequestIdleCheck() noexcept +{ + if (m_idleCheckEvent) + { + m_idleCheckEvent.SetEvent(); + } +} + +void WSLCSession::IdleWorker() +{ + const HANDLE handles[] = {m_idleCheckEvent.get(), m_sessionTerminatingEvent.get()}; + + for (;;) + { + const auto wait = WaitForMultipleObjects(ARRAYSIZE(handles), handles, FALSE, INFINITE); + if (wait != WAIT_OBJECT_0) + { + // Session is terminating (or the wait failed) — exit the worker. + break; + } + + m_idleCheckEvent.ResetEvent(); + + if (m_terminating.load()) + { + break; + } + + if (!IdleTerminationEnabled()) + { + continue; + } + + try + { + auto lock = m_lock.lock_exclusive(); + + if (m_terminating.load() || m_vmState.load() != VmState::Running) + { + continue; + } + + // If the VM's exit event is already signaled, the VM has crashed and OnVmExited() + // (running on the IO relay thread) is about to drive Terminate(). Do NOT tear the VM + // down here: StopVmLockHeld() would join the relay thread while this exclusive lock is + // held, and the relay thread can be simultaneously blocked acquiring that same lock + // inside Terminate()'s retry loop — a deadlock. Leave crash handling to the relay-thread + // path, whose Stop() self-join is a no-op and therefore cannot deadlock. + if (m_vmExitedEvent && m_vmExitedEvent.is_signaled()) + { + continue; + } + + // Keep the VM alive while any operation is in flight or any container is non-terminal. + if (m_activityCount.load() != 0 || HasActiveContainerLockHeld()) + { + continue; + } + + StopVmLockHeld(); + } + CATCH_LOG(); + } +} + +WSLCSession::VmLease WSLCSession::AcquireVmLease() +{ + return VmLease(*this); +} + +WSLCSession::VmLease::VmLease(WSLCSession& Session) : m_session(&Session) +{ + // Record an in-flight operation before bringing the VM up so the idle worker cannot tear + // it down between EnsureVmRunning() and acquiring the shared lock. + m_session->m_activityCount.fetch_add(1); + + auto countCleanup = wil::scope_exit([this]() { + m_session->m_activityCount.fetch_sub(1); + m_session = nullptr; + }); + + // The idle worker may complete a teardown in the window between EnsureVmRunning() and our + // shared-lock acquisition (it could have committed to the stop before our activity-count + // increment was visible). Our increment prevents any *future* idle teardown, so retry until + // we hold the shared lock with the VM running. This is bounded: once the increment is visible + // the idle worker will not stop the VM again, so at most one restart is needed. + // N.B. EnsureVmRunning() throws if the session has been terminated, which breaks the loop. + for (;;) + { + m_session->EnsureVmRunning(); + + m_lock = m_session->m_lock.lock_shared(); + + if (m_session->m_vmState.load() == VmState::Running) + { + break; + } + + m_lock.reset(); + } + + countCleanup.release(); +} + +WSLCSession::VmLease::VmLease(VmLease&& Other) noexcept : + m_session(std::exchange(Other.m_session, nullptr)), m_lock(std::move(Other.m_lock)) +{ +} + +WSLCSession::VmLease& WSLCSession::VmLease::operator=(VmLease&& Other) noexcept +{ + if (this != &Other) + { + if (m_session != nullptr) + { + m_lock.reset(); + m_session->m_activityCount.fetch_sub(1); + m_session->RequestIdleCheck(); + } + + m_session = std::exchange(Other.m_session, nullptr); + m_lock = std::move(Other.m_lock); + } + + return *this; +} + +WSLCSession::VmLease::~VmLease() +{ + if (m_session != nullptr) + { + // Release the shared lock before triggering the idle check so the idle worker can + // immediately take the exclusive lock if the session is now idle. + m_lock.reset(); + m_session->m_activityCount.fetch_sub(1); + m_session->RequestIdleCheck(); + } } -CATCH_RETURN() WSLCSession::~WSLCSession() { @@ -496,7 +879,7 @@ HRESULT WSLCSession::GetId(ULONG* Id) void WSLCSession::OnDockerdExited() { - if (!m_sessionTerminatingEvent.is_signaled()) + if (!m_sessionTerminatingEvent.is_signaled() && !m_vmStopRequested.load()) { WSL_LOG("UnexpectedDockerdExit", TraceLoggingValue(m_displayName.c_str(), "Name")); } @@ -504,7 +887,7 @@ void WSLCSession::OnDockerdExited() void WSLCSession::OnContainerdExited() { - if (!m_sessionTerminatingEvent.is_signaled()) + if (!m_sessionTerminatingEvent.is_signaled() && !m_vmStopRequested.load()) { WSL_LOG("UnexpectedContainerdExit", TraceLoggingValue(m_displayName.c_str(), "Name")); } @@ -512,6 +895,15 @@ void WSLCSession::OnContainerdExited() void WSLCSession::OnVmExited() { + // A teardown we initiated (idle shutdown) is in progress — the VM exit is expected and + // must not terminate the session. N.B. This runs on the IO relay thread; the flag is set + // under the exclusive lock before the relay is stopped, so it is visible here. + if (m_vmStopRequested.load()) + { + WSL_LOG("WslcVmExitedDuringStop", TraceLoggingValue(m_id, "SessionId")); + return; + } + WSL_LOG( "VmExited", TraceLoggingLevel(WINEVENT_LEVEL_WARNING), @@ -554,13 +946,13 @@ ServiceRunningProcess WSLCSession::StartProcess( auto process = launcher.Launch(*m_virtualMachine); - m_ioRelay.AddHandle(std::make_unique( + m_ioRelay->AddHandle(std::make_unique( process.GetStdHandle(1), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); - m_ioRelay.AddHandle(std::make_unique( + m_ioRelay->AddHandle(std::make_unique( process.GetStdHandle(2), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); - m_ioRelay.AddHandle(std::make_unique(process.GetExitEvent(), std::move(ExitCallback))); + m_ioRelay->AddHandle(std::make_unique(process.GetExitEvent(), std::move(ExitCallback))); return process; } @@ -733,7 +1125,7 @@ try auto [repo, tagOrDigest] = wslutil::ParseImage(Image); EnforceRegistryAllowlist(repo); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); if (!tagOrDigest.has_value()) @@ -790,7 +1182,7 @@ try comCall = RegisterUserCOMCallback(); } - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); @@ -1100,7 +1492,7 @@ try WSLCExecutionContext context(this, WarningCallback); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1126,7 +1518,7 @@ try THROW_HR_IF_MSG(E_INVALIDARG, !tagOrDigest.has_value(), "Expected tag for image import: %hs", ImageName); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1241,7 +1633,7 @@ try RETURN_HR_IF_NULL(E_POINTER, ImageNameOrID); RETURN_HR_IF(E_INVALIDARG, strlen(ImageNameOrID) > WSLC_MAX_IMAGE_NAME_LENGTH); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1274,7 +1666,7 @@ try names.emplace_back(ImageNames->Values[i]); } - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1352,7 +1744,7 @@ try filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Options->Filters, Options->FiltersCount); } - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1461,7 +1853,7 @@ try *DeletedImages = nullptr; *Count = 0; - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1535,7 +1927,7 @@ try RETURN_HR_IF_NULL(E_POINTER, Options->Tag); RETURN_HR_IF(E_INVALIDARG, strlen(Options->Repo) + strlen(Options->Tag) + 1 > WSLC_MAX_IMAGE_NAME_LENGTH); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); @@ -1572,7 +1964,7 @@ try auto [repo, tagOrDigest] = wslutil::ParseImage(Image); EnforceRegistryAllowlist(repo); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); auto requestContext = m_dockerClient->PushImage(repo, tagOrDigest, RegistryAuthenticationInformation); @@ -1593,7 +1985,7 @@ try *Output = nullptr; - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); *Output = wil::make_unique_ansistring(InspectImageLockHeld(ImageNameOrId).c_str()).release(); @@ -1641,7 +2033,7 @@ try *IdentityToken = nullptr; - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); wil::unique_cotaskmem_ansistring token; @@ -1673,7 +2065,7 @@ try auto filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Filters, FiltersCount); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); docker_schema::PruneImageResult pruneResult; @@ -1734,7 +2126,7 @@ try "Invalid process flags: 0x%x", containerOptions->InitProcessOptions.Flags); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); auto result = wil::ResultFromException([&]() { CreateContainerImpl(containerOptions, Container); }); @@ -1810,7 +2202,7 @@ void WSLCSession::CreateContainerImpl(const WSLCContainerOptions* containerOptio std::bind(&WSLCSession::OnContainerDeleted, this, std::placeholders::_1), m_eventTracker.value(), m_dockerClient.value(), - m_ioRelay); + *m_ioRelay); // Key the map by Docker's container ID, which is set in the WSLCContainerImpl constructor and stable for its lifetime. auto [it, inserted] = m_containers.emplace(container->ID(), std::move(container)); @@ -1843,7 +2235,7 @@ try ValidateName(Id, WSLC_MAX_CONTAINER_NAME_LENGTH); // Look for an exact ID match first. - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); std::lock_guard containersLock{m_containersLock}; // Purge containers that were auto-deleted via OnEvent (--rm). @@ -1916,7 +2308,7 @@ try filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Options->Filters, Options->FiltersCount); } - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); std::vector dockerContainers; @@ -1995,7 +2387,7 @@ try auto filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Filters, FiltersCount); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient.has_value()); std::lock_guard containersLock{m_containersLock}; @@ -2066,7 +2458,7 @@ try *Errno = -1; // Make sure not to return 0 if something fails. } - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); auto process = m_virtualMachine->CreateLinuxProcess(Executable, *Options, TtyRows, TtyColumns, Errno); @@ -2092,7 +2484,7 @@ try THROW_HR_WITH_USER_ERROR_IF(E_INVALIDARG, Localization::MessagePathNotAbsolute(Path), !std::filesystem::path(Path).is_absolute()); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); // Attach the disk to the VM (AttachDisk() performs the access check for the VHD file). @@ -2120,7 +2512,7 @@ try auto driverOpts = wslutil::ParseKeyValuePairs(Options->DriverOpts, Options->DriverOptsCount); auto labels = wslutil::ParseKeyValuePairs(Options->Labels, Options->LabelsCount, WSLCVolumeMetadataLabel); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_volumes); if (Options->Name != nullptr && Options->Name[0] != '\0') @@ -2140,7 +2532,7 @@ try RETURN_HR_IF_NULL(E_POINTER, Name); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_volumes); m_volumes->DeleteVolume(Name); @@ -2161,7 +2553,7 @@ try auto filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Filters, FiltersCount); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_volumes); auto volumeList = m_volumes->ListVolumes(std::move(filters)); @@ -2193,7 +2585,7 @@ try std::string name = Name; ValidateName(name.c_str(), WSLC_MAX_VOLUME_NAME_LENGTH); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_volumes); std::string json = m_volumes->InspectVolume(name); @@ -2218,7 +2610,7 @@ try auto filters = wsl::windows::common::wslutil::ParseKeyMultiValuePairs(Filters, FiltersCount); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_volumes); WSLCVolumes::PruneVolumesResult pruneResult; @@ -2298,7 +2690,7 @@ try auto driverOpts = wslutil::ParseKeyValuePairs(Options->DriverOpts, Options->DriverOptsCount); auto labels = wslutil::ParseKeyValuePairs(Options->Labels, Options->LabelsCount, WSLCNetworkManagedLabel); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); @@ -2399,7 +2791,7 @@ try std::string name = Name; ValidateName(name.c_str(), WSLC_MAX_NETWORK_NAME_LENGTH); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_dockerClient); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); @@ -2439,7 +2831,7 @@ try *Networks = nullptr; *Count = 0; - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); std::lock_guard networksLock(m_networksLock); if (m_networks.empty()) @@ -2478,7 +2870,7 @@ try std::string name = Name; ValidateName(name.c_str(), WSLC_MAX_NETWORK_NAME_LENGTH); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); std::lock_guard networksLock(m_networksLock); auto it = m_networks.find(name); @@ -2589,90 +2981,29 @@ try // Acquire an exclusive lock to ensure that no operation is running. WI_VERIFY(sessionLock); - std::lock_guard containersLock(m_containersLock); - std::lock_guard networksLock(m_networksLock); - - m_containers.clear(); - m_volumes.reset(); - m_networks.clear(); - - // Stop the IO relay. - // This stops: - // - container state monitoring. - // - container init process relays - // - execs relays - // - container logs relays - m_ioRelay.Stop(); - - { - std::lock_guard allocatedPortsLock(m_allocatedPortsLock); - m_allocatedPorts.clear(); - } - - m_eventTracker.reset(); - m_dockerClient.reset(); - - // Check if the VM has already exited (e.g., killed externally). - // If so, skip operations that require a live VM to avoid unnecessary waits. - // N.B. m_vmExitedEvent may be uninitialized if Terminate() is called from the - // Initialize() error path before GetTerminationEvent() succeeds. - if (m_vmExitedEvent && m_vmExitedEvent.is_signaled()) - { - WSL_LOG("SkippingGracefulShutdown_VmDead", TraceLoggingValue(m_id, "SessionId")); - - // The VM exited on its own, so it recorded the cause. - if (m_virtualMachine) - { - wil::unique_cotaskmem_string details; - LOG_IF_FAILED(m_virtualMachine->GetTerminationReason(&m_terminationReason, &details)); - m_terminationDetails = details ? details.get() : L""; - } - } - else - { - // The VM is still alive, so this is a graceful shutdown initiated by us. - m_terminationReason = WSLCVirtualMachineTerminationReasonShutdown; - - if (m_virtualMachine) - { - m_virtualMachine->OnSessionTerminated(); + // Acquire an exclusive lock to ensure that no operation is running. + WI_VERIFY(sessionLock); - // Stop dockerd first, then containerd (dockerd is a client of containerd). - // N.B. dockerd waits a couple seconds if there are any outstanding HTTP request sockets opened. - if (m_dockerdProcess.has_value()) - { - auto dockerdExitCode = StopProcess(m_dockerdProcess.value(), c_processTerminateTimeoutMs, c_processKillTimeoutMs); - WSL_LOG("DockerdExit", TraceLoggingValue(dockerdExitCode, "code")); - } + // Tear down the VM (if running) and all VM-scoped state, capturing the termination reason. + // This mirrors the soft teardown used for idle shutdown, but here it is permanent. + TearDownVmLockHeld(/* CaptureTerminationReason */ true); - if (m_containerdProcess.has_value()) - { - auto containerdExitCode = StopProcess(m_containerdProcess.value(), c_processTerminateTimeoutMs, c_processKillTimeoutMs); - WSL_LOG("ContainerdExit", TraceLoggingValue(containerdExitCode, "code")); - } + m_vmState.store(VmState::None); - // N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running. - try - { - m_virtualMachine->Unmount(c_containerdStorage); - } - CATCH_LOG(); - } - } + // Signal completion last so any observer of the terminated event sees a fully torn-down + // session and a populated termination reason. + m_sessionTerminatedEvent.SetEvent(); - m_dockerdProcess.reset(); - m_containerdProcess.reset(); - m_virtualMachine.reset(); + // Release the exclusive lock before joining the idle worker. If the worker is currently + // blocked acquiring the exclusive lock (about to evaluate idle teardown), it must be able + // to obtain it, observe m_terminating, and exit — otherwise the join below would deadlock. + sessionLock.reset(); - // Delete the ephemeral swap VHD now that the VM is gone. - if (!m_swapVhdPath.empty()) + if (m_idleThread.joinable()) { - LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(m_swapVhdPath.c_str())); - m_swapVhdPath.clear(); + m_idleThread.join(); } - m_sessionTerminatedEvent.SetEvent(); - return S_OK; } CATCH_RETURN(); @@ -2740,7 +3071,7 @@ try RETURN_HR_IF_NULL(E_POINTER, WindowsPath); RETURN_HR_IF_NULL(E_POINTER, LinuxPath); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); return m_virtualMachine->MountWindowsFolder(WindowsPath, LinuxPath, ReadOnly); @@ -2754,7 +3085,7 @@ try RETURN_HR_IF_NULL(E_POINTER, LinuxPath); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); return m_virtualMachine->UnmountWindowsFolder(LinuxPath); @@ -2766,7 +3097,7 @@ try { WSLCExecutionContext context(this); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); std::lock_guard allocatedPortsLock(m_allocatedPortsLock); @@ -2812,7 +3143,7 @@ try { WSLCExecutionContext context(this); - auto lock = m_lock.lock_shared(); + auto lock = AcquireVmLease(); THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); std::lock_guard allocatedPortsLock(m_allocatedPortsLock); @@ -2960,6 +3291,8 @@ void WSLCSession::CancelUserCOMCallbacks() void WSLCSession::OnContainerDeleted(const WSLCContainerImpl* Container) { + // N.B. This is an internal callback (invoked while a container is being deleted); it must + // not bring up the VM via AcquireVmLease(). It only removes bookkeeping under the shared lock. auto lock = m_lock.lock_shared(); std::lock_guard containersLock(m_containersLock); @@ -3007,6 +3340,17 @@ try } CATCH_RETURN(); +HRESULT WSLCSession::GetVmDiagnostics(_Out_ WSLCVmDiagnostics* Diagnostics) +{ + RETURN_HR_IF_NULL(E_POINTER, Diagnostics); + + // Reads atomics only: this must not acquire a VM lease or otherwise bring the VM up, + // so callers can observe idle termination without keeping the VM alive. + Diagnostics->Running = m_vmState.load() == VmState::Running; + Diagnostics->StartCount = m_vmStartCount.load(); + return S_OK; +} + void WSLCSession::RecoverExistingContainers() { WI_ASSERT(m_dockerClient.has_value()); @@ -3028,7 +3372,7 @@ void WSLCSession::RecoverExistingContainers() std::bind(&WSLCSession::OnContainerDeleted, this, std::placeholders::_1), m_eventTracker.value(), m_dockerClient.value(), - m_ioRelay); + *m_ioRelay); auto [it, inserted] = m_containers.emplace(container->ID(), std::move(container)); WI_ASSERT(inserted); diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index d7dd3db53f..00165576f9 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -22,7 +22,10 @@ Module Name: #include "DockerEventTracker.h" #include "DockerHTTPClient.h" #include "IORelay.h" +#include #include +#include +#include #include namespace wsl::windows::service::wslc { @@ -100,6 +103,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession IFACEMETHOD(GetState)(_Out_ WSLCSessionState* State) override; IFACEMETHOD(GetTerminationEvent)(_Out_ HANDLE* Event) override; IFACEMETHOD(GetTerminationReason)(_Out_ WSLCVirtualMachineTerminationReason* Reason, _Out_ LPWSTR* Details) override; + IFACEMETHOD(GetVmDiagnostics)(_Out_ WSLCVmDiagnostics* Diagnostics) override; // Image management. IFACEMETHOD(PullImage)( @@ -215,9 +219,58 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession bool WaitForEventOrSessionTerminating(HANDLE Event, std::chrono::milliseconds Timeout) const; + // Signals the idle worker to re-evaluate whether the VM can be torn down. + // Safe to call from any thread, including IO relay / container callbacks. + void RequestIdleCheck() noexcept; + private: ULONG m_id = 0; + // VM lifecycle state for on-demand creation / idle termination. + enum class VmState + { + None, + Starting, + Running, + Stopping, + }; + + _Requires_exclusive_lock_held_(m_lock) + void StartVmLockHeld(); + _Requires_exclusive_lock_held_(m_lock) + void StopVmLockHeld(); + _Requires_exclusive_lock_held_(m_lock) + void TearDownVmLockHeld(bool CaptureTerminationReason = false); + _Requires_exclusive_lock_held_(m_lock) + bool HasActiveContainerLockHeld(); + void EnsureVmRunning(); + void IdleWorker(); + bool IdleTerminationEnabled() const noexcept; + void PersistSettings(const WSLCSessionInitSettings& Settings, PSID UserSid); + + // RAII lease taken at the top of every VM-requiring operation. On construction it + // ensures the VM is running and records an in-flight operation so idle teardown is + // deferred; it then holds the shared session lock for the operation's duration. On + // destruction it releases the lock and triggers an idle check. + class VmLease + { + public: + VmLease() = default; + explicit VmLease(WSLCSession& Session); + VmLease(VmLease&& Other) noexcept; + VmLease& operator=(VmLease&& Other) noexcept; + ~VmLease(); + + VmLease(const VmLease&) = delete; + VmLease& operator=(const VmLease&) = delete; + + private: + WSLCSession* m_session{}; + wil::rwlock_release_shared_scope_exit m_lock; + }; + + [[nodiscard]] VmLease AcquireVmLease(); + __requires_lock_held(m_userHandlesLock) void CancelUserHandleIO(); __requires_lock_held(m_userCOMCallbacksLock) void CancelUserCOMCallbacks(); @@ -255,6 +308,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession void StreamImageOperation(DockerHTTPClient::HTTPRequestContext& requestContext, LPCSTR Image, LPCSTR OperationName, IProgressCallback* ProgressCallback); std::optional m_dockerClient; + wil::com_ptr m_vmFactory; std::optional m_virtualMachine; std::optional m_eventTracker; wil::unique_event m_dockerdReadyEvent{wil::EventOptions::ManualReset}; @@ -279,7 +333,26 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession WSLCVirtualMachineTerminationReason m_terminationReason{WSLCVirtualMachineTerminationReasonUnknown}; std::wstring m_terminationDetails; wil::srwlock m_lock; - IORelay m_ioRelay; + std::optional m_ioRelay; + + // VM lifecycle / idle-termination state. + std::atomic m_vmState{VmState::None}; + std::atomic m_activityCount{0}; + std::atomic m_vmStopRequested{false}; + // Number of times the VM has been (re)created; surfaced via GetVmDiagnostics. + std::atomic m_vmStartCount{0}; + wil::unique_event m_idleCheckEvent{wil::EventOptions::ManualReset}; + std::thread m_idleThread; + + // Persisted settings required to (re)create the VM on demand. The string fields point + // into the owned storage members below (or m_displayName) so they remain valid for the + // lifetime of the session. + WSLCSessionInitSettings m_settings{}; + std::optional m_settingsCreatorProcessName; + std::optional m_settingsStoragePath; + std::optional m_settingsRootVhdTypeOverride; + std::vector m_userSid; + std::optional m_containerdProcess; std::optional m_dockerdProcess; WSLCFeatureFlags m_featureFlags{}; diff --git a/test/windows/wslc/e2e/WSLCE2EHelpers.h b/test/windows/wslc/e2e/WSLCE2EHelpers.h index d05940c58e..79c6d2f8ff 100644 --- a/test/windows/wslc/e2e/WSLCE2EHelpers.h +++ b/test/windows/wslc/e2e/WSLCE2EHelpers.h @@ -111,6 +111,11 @@ struct TestSession return m_storagePath; } + IWSLCSession* Session() const + { + return m_session.get(); + } + private: std::wstring m_name; std::filesystem::path m_storagePath; diff --git a/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp new file mode 100644 index 0000000000..cf8783db8e --- /dev/null +++ b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp @@ -0,0 +1,143 @@ +/*++ + +Copyright (c) Microsoft. All rights reserved. + +Module Name: + + WSLCE2EVmIdleTests.cpp + +Abstract: + + End-to-end tests for on-demand / idle-terminating wslc VMs. A session's backing VM is + created lazily on the first VM-requiring operation and torn down again once there are no + active (Created or Running) containers and no in-flight operations, while the per-user + session survives across VM restarts. VM lifecycle is observed via + IWSLCSession::GetVmDiagnostics, which reads state without bringing the VM up. + +--*/ + +#include "precomp.h" +#include "windows/Common.h" +#include "WSLCExecutor.h" +#include "WSLCE2EHelpers.h" + +namespace WSLCE2ETests { +using namespace wsl::shared; + +class WSLCE2EVmIdleTests +{ + WSLC_TEST_CLASS(WSLCE2EVmIdleTests) + + const TestImage& AlpineImage = AlpineTestImage(); + + static WSLCVmDiagnostics QueryDiagnostics(const TestSession& session) + { + WSLCVmDiagnostics diagnostics{}; + VERIFY_SUCCEEDED(session.Session()->GetVmDiagnostics(&diagnostics)); + return diagnostics; + } + + // Polls VM diagnostics until the VM reaches the desired running state. Idle teardown and + // on-demand bring-up happen asynchronously, so callers must wait rather than assume. + static void WaitForVmRunningState(const TestSession& session, bool running) + { + retry::RetryWithTimeout( + [&]() { + const auto diagnostics = QueryDiagnostics(session); + THROW_HR_IF(E_FAIL, static_cast(diagnostics.Running) != running); + }, + std::chrono::milliseconds(250), + std::chrono::seconds(60)); + } + + // A freshly created session has no VM until the first VM-requiring operation arrives, and + // the VM idle-terminates once that operation completes with nothing left active. + WSLC_TEST_METHOD(WSLCE2E_VmIdle_LazyStartAndIdleStop) + { + auto session = TestSession::Create(L"wslc-vmidle-lazy"); + + const auto initial = QueryDiagnostics(session); + VERIFY_IS_FALSE(static_cast(initial.Running)); + VERIFY_ARE_EQUAL(initial.StartCount, 0ul); + + // The first VM-requiring operation brings the VM up on demand. + EnsureImageIsLoaded(AlpineImage, session.Name()); + + // With no Created/Running containers and no in-flight operations, the VM tears down. + WaitForVmRunningState(session, false); + + const auto afterIdle = QueryDiagnostics(session); + VERIFY_IS_TRUE(afterIdle.StartCount >= 1ul); + } + + // After the VM idle-terminates, a subsequent operation recreates it from scratch and any + // previously loaded images remain available (storage persists across VM restarts). + WSLC_TEST_METHOD(WSLCE2E_VmIdle_RecreateOnDemandAndPersistState) + { + auto session = TestSession::Create(L"wslc-vmidle-recreate"); + + EnsureImageIsLoaded(AlpineImage, session.Name()); + WaitForVmRunningState(session, false); + const auto startCountBeforeRecreate = QueryDiagnostics(session).StartCount; + + // Running a container recreates the VM, runs to completion, then idles again. + RunWslcAndVerify( + std::format(L"container run --session {} --rm {} echo hello", session.Name(), AlpineImage.NameAndTag()), + {.Stderr = L"", .ExitCode = 0}); + + WaitForVmRunningState(session, false); + const auto startCountAfterRecreate = QueryDiagnostics(session).StartCount; + VERIFY_IS_TRUE(startCountAfterRecreate > startCountBeforeRecreate); + + // The image loaded before the restart survived the VM teardown/recreate cycle. + auto images = RunWslc(std::format(L"image list --session {}", session.Name())); + images.Verify({.Stderr = L"", .ExitCode = 0}); + VERIFY_IS_TRUE(images.StdoutContainsSubstring(L"alpine")); + } + + // A container in the Created state (created but never started) counts as active and keeps + // the VM alive; removing it lets the VM idle-terminate. + WSLC_TEST_METHOD(WSLCE2E_VmIdle_CreatedContainerKeepsVmAlive) + { + constexpr auto containerName = L"wslc-vmidle-created"; + auto session = TestSession::Create(L"wslc-vmidle-created-session"); + + EnsureImageIsLoaded(AlpineImage, session.Name()); + + RunWslcAndVerify( + std::format(L"container create --session {} --name {} {} sleep 3600", session.Name(), containerName, AlpineImage.NameAndTag()), + {.Stderr = L"", .ExitCode = 0}); + + // The VM must stay up while a Created container exists, even with nothing running. + WaitForVmRunningState(session, true); + std::this_thread::sleep_for(std::chrono::seconds(3)); + VERIFY_IS_TRUE(static_cast(QueryDiagnostics(session).Running)); + + // Removing the only container drops the active count to zero and the VM idles. + RunWslcAndVerify( + std::format(L"container rm --session {} {}", session.Name(), containerName), {.Stderr = L"", .ExitCode = 0}); + + WaitForVmRunningState(session, false); + } + + // Stress the lease-vs-idle-teardown race: each run idles the VM, and the next run's lease + // arrives while teardown may still be in flight. All operations must succeed (no spurious + // ERROR_INVALID_STATE from racing a VM that is stopping). + WSLC_TEST_METHOD(WSLCE2E_VmIdle_ConcurrentRecreateDoesNotFail) + { + auto session = TestSession::Create(L"wslc-vmidle-stress"); + + EnsureImageIsLoaded(AlpineImage, session.Name()); + + for (int i = 0; i < 12; i++) + { + // Intentionally do not wait between iterations so each lease races the previous + // run's idle teardown. + auto result = RunWslc( + std::format(L"container run --session {} --rm {} echo iteration-{}", session.Name(), AlpineImage.NameAndTag(), i)); + result.Verify({.Stderr = L"", .ExitCode = 0}); + } + } +}; + +} // namespace WSLCE2ETests From 69f53af9a623e6b047e60c4f0ca1010620c68b10 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Wed, 10 Jun 2026 12:38:30 -0700 Subject: [PATCH 07/22] wslcsession: fix idle-VM runtime races (factory proxy, exit code, rm deadlock) Runtime fixes for the idle-terminate feature so all four WSLCE2EVmIdleTests pass: - Register the IWSLCVirtualMachineFactory proxy in the process Global Interface Table and re-fetch it per VM creation, and MTA-init the IdleWorker / IORelay threads, so on-demand VM creation no longer fails with RPC_E_WRONG_THREAD. - Register the factory IID in the MSIX so cross-proc marshalling resolves. - Preserve a container's real exit code in DockerContainerProcessControl:: OnContainerReleased: only synthesize 128+SIGKILL when no exit code was ever recorded, so --rm 'container run' returns 0 instead of 137 when the VM idle-terminates immediately after the container exits. - Fix an AB-BA deadlock between 'container rm' and idle teardown: hold a VmLease across WSLCContainer::Delete (keeps the VM up and blocks teardown) and drop the now-redundant shared-lock re-acquire in OnContainerDeleted (which would deadlock behind the idle worker's pending exclusive lock). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/IORelay.cpp | 4 ++ src/windows/wslcsession/WSLCContainer.cpp | 5 +++ .../wslcsession/WSLCProcessControl.cpp | 10 ++++- src/windows/wslcsession/WSLCSession.cpp | 40 +++++++++++++++---- src/windows/wslcsession/WSLCSession.h | 13 +++++- 5 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/windows/wslcsession/IORelay.cpp b/src/windows/wslcsession/IORelay.cpp index 6eec63123b..09f5e05cd4 100644 --- a/src/windows/wslcsession/IORelay.cpp +++ b/src/windows/wslcsession/IORelay.cpp @@ -78,6 +78,10 @@ try { common::wslutil::SetThreadDescription(L"IORelay"); + // Handle callbacks dispatched from this thread (e.g. unexpected VM exit) can tear the VM down, + // releasing cross-process COM proxies, so join the process MTA to avoid RPC_E_WRONG_THREAD. + const auto coInit = wil::CoInitializeEx(COINIT_MULTITHREADED); + windows::common::io::MultiHandleWait io; // N.B. All the IO must happen on the thread. diff --git a/src/windows/wslcsession/WSLCContainer.cpp b/src/windows/wslcsession/WSLCContainer.cpp index 8de0c11ba1..fd7638fd4e 100644 --- a/src/windows/wslcsession/WSLCContainer.cpp +++ b/src/windows/wslcsession/WSLCContainer.cpp @@ -2278,6 +2278,11 @@ try THROW_HR_IF_MSG(E_INVALIDARG, WI_IsAnyFlagSet(Flags, ~WSLCDeleteFlagsValid), "Invalid flags: 0x%x", Flags); // Special case for Delete(): If deletion is successful, notify the WSLCSession that the container has been deleted. + // Hold a VM lease across the whole operation: deleting a container makes it inactive and + // can trigger an idle teardown. Without the lease the idle worker could take the session + // lock exclusively and clear m_containers (destroying this container) concurrently, racing + // the delete and inverting the container->session lock order. + auto vmLease = m_session.AcquireVmLease(); auto [lock, impl] = LockImpl(); impl->Delete(Flags); diff --git a/src/windows/wslcsession/WSLCProcessControl.cpp b/src/windows/wslcsession/WSLCProcessControl.cpp index 04e8ff2f8a..4e6da4e045 100644 --- a/src/windows/wslcsession/WSLCProcessControl.cpp +++ b/src/windows/wslcsession/WSLCProcessControl.cpp @@ -101,7 +101,15 @@ void DockerContainerProcessControl::OnContainerReleased() noexcept // Signal the exit event to prevent callers from being blocked on it. if (!m_exitEvent.is_signaled()) { - m_exitedCode = 128 + WSLCSignalSIGKILL; + // If the container already produced a real exit code (recorded by SetExitCode but not yet + // signaled — e.g. an --rm container whose init-exit signal is deferred to the Destroy + // event), preserve it. Only synthesize SIGKILL when the container is released without ever + // having produced an exit code (an abrupt teardown of a still-running container). + if (!m_exitedCode.has_value()) + { + m_exitedCode = 128 + WSLCSignalSIGKILL; + } + m_exitEvent.SetEvent(); } } diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 2e6e988aa3..a4177de38f 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -296,7 +296,7 @@ HRESULT WSLCSession::Initialize( try { RETURN_HR_IF(E_POINTER, Settings == nullptr || VmFactory == nullptr); - RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_ALREADY_INITIALIZED), m_vmFactory != nullptr); + RETURN_HR_IF(HRESULT_FROM_WIN32(ERROR_ALREADY_INITIALIZED), m_vmFactoryGitCookie != 0); THROW_HR_IF_MSG( E_INVALIDARG, WI_IsAnyFlagSet(Settings->FeatureFlags, ~WSLCFeatureFlagsValid), "Invalid feature flags: 0x%x", Settings->FeatureFlags); @@ -316,7 +316,12 @@ try m_creatorProcessName = Settings->CreatorProcessName ? Settings->CreatorProcessName : L""; m_featureFlags = Settings->FeatureFlags; m_pluginNotifier = PluginNotifier; - m_vmFactory = VmFactory; + + // Park the VM factory in the Global Interface Table. It is supplied here (on the call that + // creates the session) but used on demand from other threads/apartments; storing the raw + // proxy and calling it later would raise RPC_E_WRONG_THREAD. + m_git = wil::CoCreateInstance(CLSID_StdGlobalInterfaceTable, CLSCTX_INPROC_SERVER); + THROW_IF_FAILED(m_git->RegisterInterfaceInGlobal(VmFactory, __uuidof(IWSLCVirtualMachineFactory), &m_vmFactoryGitCookie)); // Persist a deep copy of the settings (and the creating user's SID) required to // (re)create the VM on demand. @@ -434,10 +439,15 @@ void WSLCSession::StartVmLockHeld() // during teardown and cannot be restarted. m_ioRelay.emplace(); - // Create the VM via the factory. The VM produces crash events; the session multiplexes them - // out to any registered ICrashDumpCallback subscribers via OnCrashDumpWritten. + // Create the VM via the factory. Re-fetch the factory from the GIT so we call it through a + // proxy marshalled into this thread's apartment (see m_git). The VM produces crash events; + // the session multiplexes them out to any registered ICrashDumpCallback subscribers via + // OnCrashDumpWritten. + wil::com_ptr vmFactory; + THROW_IF_FAILED(m_git->GetInterfaceFromGlobal(m_vmFactoryGitCookie, __uuidof(IWSLCVirtualMachineFactory), vmFactory.put_void())); + wil::com_ptr vm; - THROW_IF_FAILED(m_vmFactory->CreateVirtualMachine(&vm)); + THROW_IF_FAILED(vmFactory->CreateVirtualMachine(&vm)); m_virtualMachine.emplace( vm.get(), @@ -633,6 +643,10 @@ void WSLCSession::RequestIdleCheck() noexcept void WSLCSession::IdleWorker() { + // Idle teardown releases cross-process COM proxies (the VM and its VM-scoped state), so this + // thread must join the process MTA; otherwise those Release/calls fail with RPC_E_WRONG_THREAD. + const auto coInit = wil::CoInitializeEx(COINIT_MULTITHREADED); + const HANDLE handles[] = {m_idleCheckEvent.get(), m_sessionTerminatingEvent.get()}; for (;;) @@ -3004,6 +3018,14 @@ try m_idleThread.join(); } + // The idle worker has exited and no operation can run past m_terminated, so the parked VM + // factory can no longer be re-fetched; revoke it from the GIT. + if (m_vmFactoryGitCookie != 0) + { + LOG_IF_FAILED(m_git->RevokeInterfaceFromGlobal(m_vmFactoryGitCookie)); + m_vmFactoryGitCookie = 0; + } + return S_OK; } CATCH_RETURN(); @@ -3291,9 +3313,11 @@ void WSLCSession::CancelUserCOMCallbacks() void WSLCSession::OnContainerDeleted(const WSLCContainerImpl* Container) { - // N.B. This is an internal callback (invoked while a container is being deleted); it must - // not bring up the VM via AcquireVmLease(). It only removes bookkeeping under the shared lock. - auto lock = m_lock.lock_shared(); + // N.B. Invoked only from WSLCContainer::Delete, which already holds a VmLease (the shared + // session lock). The lease prevents a concurrent idle teardown from clearing m_containers, + // so this only needs m_containersLock. It must NOT re-acquire the shared session lock here: + // doing so while the idle worker is queued for the exclusive lock would deadlock (recursive + // shared acquire behind a pending writer). std::lock_guard containersLock(m_containersLock); WI_VERIFY(m_containers.erase(Container->ID()) == 1); diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index 00165576f9..8afdaa79a0 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -78,6 +78,10 @@ class UserCOMCallback class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession : public Microsoft::WRL::RuntimeClass, IWSLCSession, IFastRundown, ISupportErrorInfo> { + // WSLCContainer::Delete acquires a VmLease to keep the VM alive (and block idle + // teardown) for the duration of a container deletion. + friend class WSLCContainer; + public: WSLCSession() = default; @@ -308,7 +312,14 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession void StreamImageOperation(DockerHTTPClient::HTTPRequestContext& requestContext, LPCSTR Image, LPCSTR OperationName, IProgressCallback* ProgressCallback); std::optional m_dockerClient; - wil::com_ptr m_vmFactory; + + // The VM factory is a cross-process proxy supplied by the SYSTEM service at Initialize() time + // but first used later (on demand) from a different thread/apartment. A directly stored proxy + // would fail with RPC_E_WRONG_THREAD, so it is parked in the process Global Interface Table and + // re-fetched (re-marshalled into the calling apartment) each time a VM is created. + wil::com_ptr m_git; + DWORD m_vmFactoryGitCookie{}; + std::optional m_virtualMachine; std::optional m_eventTracker; wil::unique_event m_dockerdReadyEvent{wil::EventOptions::ManualReset}; From d51eab1db9e23c0b4bd8623a46189e06c8b0ee8e Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Wed, 10 Jun 2026 15:04:12 -0700 Subject: [PATCH 08/22] wslcsession: keep VM alive across CLI container op via BeginContainerOperation The wslc CLI performs each container mutation as two COM round-trips (OpenContainer to resolve a wrapper, then the operation), and may stream output afterwards. With on-demand VM idle-termination enabled (any persistent-storage session), the VM could idle-stop in the gap between the calls when the target container is not Created/Running: TearDownVmLockHeld clears m_containers, disconnecting the client-held wrapper, so the second call failed with RPC_E_DISCONNECTED. This regressed the container CRUD E2E suite (rm of stopped containers, and cleanup helpers). Add IWSLCSession::BeginContainerOperation, returning an activity token that holds m_activityCount > 0 for as long as the client holds it. The CLI now holds the token across the whole operation (resolve + operate + streamed relay), so the idle worker cannot tear the VM down mid-operation. Releasing the token (or the client exiting, via fast rundown) lets the VM idle again. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/service/inc/wslc.idl | 8 ++ .../wslc/services/ContainerService.cpp | 19 +++- src/windows/wslc/services/SessionModel.h | 10 ++ src/windows/wslcsession/WSLCSession.cpp | 103 +++++++++++++++--- src/windows/wslcsession/WSLCSession.h | 1 + 5 files changed, 123 insertions(+), 18 deletions(-) diff --git a/src/windows/service/inc/wslc.idl b/src/windows/service/inc/wslc.idl index 5d029a410d..525857e57c 100644 --- a/src/windows/service/inc/wslc.idl +++ b/src/windows/service/inc/wslc.idl @@ -823,6 +823,14 @@ interface IWSLCSession : IUnknown // Container management. HRESULT CreateContainer([in] const WSLCContainerOptions* Options, [in, unique] IWarningCallback* WarningCallback, [out] IWSLCContainer** Container); HRESULT OpenContainer([in, ref] LPCSTR Id, [out] IWSLCContainer** Container); + + // Keeps the VM alive for the duration of a client-side container operation. The CLI performs + // each mutation as two round-trips (OpenContainer followed by the operation) and may stream + // output afterwards. With on-demand VM idle-termination the VM could otherwise tear down + // between those calls, disconnecting the container wrapper and failing the second call with + // RPC_E_DISCONNECTED. The client holds the returned token for the whole operation; releasing + // it (or the client exiting) lets the VM idle-terminate again. + HRESULT BeginContainerOperation([out] IUnknown** Operation); HRESULT ListContainers([in, unique] const WSLCListContainersOptions* Options,[out, size_is(, *Count)] WSLCContainerEntry** Containers,[out] ULONG* Count, [out, size_is(, *PortsCount)] WSLCContainerPortMapping** Ports, [out] ULONG* PortsCount); HRESULT PruneContainers([in, unique, size_is(FiltersCount)] const WSLCFilter* Filters, [in] ULONG FiltersCount, [out] WSLCPruneContainersResults* Result); diff --git a/src/windows/wslc/services/ContainerService.cpp b/src/windows/wslc/services/ContainerService.cpp index 1015762ecc..49e3a250ba 100644 --- a/src/windows/wslc/services/ContainerService.cpp +++ b/src/windows/wslc/services/ContainerService.cpp @@ -285,6 +285,7 @@ std::wstring ContainerService::FormatRelativeTime(ULONGLONG timestamp) int ContainerService::Attach(Session& session, const std::string& id) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); @@ -446,6 +447,7 @@ CreateContainerResult ContainerService::Create(Session& session, const std::stri int ContainerService::Start(Session& session, const std::string& id, bool attach) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); WSLCContainerStartFlags flags = attach ? WSLCContainerStartFlagsAttach : WSLCContainerStartFlagsNone; @@ -476,6 +478,7 @@ int ContainerService::Start(Session& session, const std::string& id, bool attach void ContainerService::Stop(Session& session, const std::string& id, StopContainerOptions options) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); THROW_IF_FAILED_EXCEPT(container->Stop(options.Signal, options.Timeout), WSLC_E_CONTAINER_NOT_RUNNING); @@ -483,6 +486,7 @@ void ContainerService::Stop(Session& session, const std::string& id, StopContain void ContainerService::Kill(Session& session, const std::string& id, WSLCSignal signal) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); THROW_IF_FAILED(container->Kill(signal)); @@ -490,6 +494,7 @@ void ContainerService::Kill(Session& session, const std::string& id, WSLCSignal void ContainerService::Delete(Session& session, const std::string& id, bool force) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); THROW_IF_FAILED(container->Delete(force ? WSLCDeleteFlagsForce : WSLCDeleteFlagsNone)); @@ -544,6 +549,7 @@ std::vector ContainerService::List( int ContainerService::Exec(Session& session, const std::string& id, ContainerOptions options) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); @@ -575,6 +581,7 @@ int ContainerService::Exec(Session& session, const std::string& id, ContainerOpt InspectContainer ContainerService::Inspect(Session& session, const std::string& id) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); wil::unique_cotaskmem_ansistring output; @@ -584,6 +591,7 @@ InspectContainer ContainerService::Inspect(Session& session, const std::string& void ContainerService::Logs(Session& session, const std::string& id, bool follow, bool timestamps, ULONGLONG since, ULONGLONG until, ULONGLONG tail) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); @@ -596,13 +604,15 @@ void ContainerService::Logs(Session& session, const std::string& id, bool follow THROW_IF_FAILED(container->Logs(flags, &stdoutHandle, &stderrHandle, since, until, tail)); wsl::windows::common::io::MultiHandleWait io; - io.AddHandle(std::make_unique>( - stdoutHandle.Release(), GetStdHandle(STD_OUTPUT_HANDLE))); + io.AddHandle( + std::make_unique>( + stdoutHandle.Release(), GetStdHandle(STD_OUTPUT_HANDLE))); if (!stderrHandle.Empty()) // This handle is only used for non-tty processes. { - io.AddHandle(std::make_unique>( - stderrHandle.Release(), GetStdHandle(STD_ERROR_HANDLE))); + io.AddHandle( + std::make_unique>( + stderrHandle.Release(), GetStdHandle(STD_ERROR_HANDLE))); } // TODO: Handle ctrl-c. @@ -611,6 +621,7 @@ void ContainerService::Logs(Session& session, const std::string& id, bool follow wsl::windows::common::docker_schema::ContainerStats ContainerService::Stats(Session& session, const std::string& id) { + auto operation = session.BeginContainerOperation(); wil::com_ptr container; THROW_IF_FAILED(session.Get()->OpenContainer(id.c_str(), &container)); wil::unique_cotaskmem_ansistring output; diff --git a/src/windows/wslc/services/SessionModel.h b/src/windows/wslc/services/SessionModel.h index 7424a8c043..6111bf2b44 100644 --- a/src/windows/wslc/services/SessionModel.h +++ b/src/windows/wslc/services/SessionModel.h @@ -27,6 +27,16 @@ struct Session return m_session.get(); } + // Acquires an activity token that keeps the VM alive for the duration of a client-side + // container operation (resolve + operate, plus any streamed output). Hold the returned + // pointer for the whole operation; releasing it lets the VM idle-terminate again. + [[nodiscard]] wil::com_ptr BeginContainerOperation() const + { + wil::com_ptr operation; + THROW_IF_FAILED(m_session->BeginContainerOperation(&operation)); + return operation; + } + private: wil::com_ptr m_session; }; diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index a4177de38f..3275d38875 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -960,11 +960,13 @@ ServiceRunningProcess WSLCSession::StartProcess( auto process = launcher.Launch(*m_virtualMachine); - m_ioRelay->AddHandle(std::make_unique( - process.GetStdHandle(1), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); + m_ioRelay->AddHandle( + std::make_unique( + process.GetStdHandle(1), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); - m_ioRelay->AddHandle(std::make_unique( - process.GetStdHandle(2), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); + m_ioRelay->AddHandle( + std::make_unique( + process.GetStdHandle(2), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); m_ioRelay->AddHandle(std::make_unique(process.GetExitEvent(), std::move(ExitCallback))); @@ -1074,8 +1076,9 @@ void WSLCSession::StreamImageOperation(DockerHTTPClient::HTTPRequestContext& req auto onCompleted = [&]() { io.Cancel(); }; - io.AddHandle(std::make_unique( - requestContext, std::move(onHttpResponse), std::move(onChunk), std::move(onCompleted))); + io.AddHandle( + std::make_unique( + requestContext, std::move(onHttpResponse), std::move(onChunk), std::move(onCompleted))); io.Run({}); @@ -1247,8 +1250,9 @@ try auto io = CreateIOContext(); - io.AddHandle(std::make_unique>( - buildFileHandle.Get(), common::io::HandleWrapper{buildProcess.GetStdHandle(WSLCFDStdin)})); + io.AddHandle( + std::make_unique>( + buildFileHandle.Get(), common::io::HandleWrapper{buildProcess.GetStdHandle(WSLCFDStdin)})); bool verbose = WI_IsFlagSet(Options->Flags, WSLCBuildImageFlagsVerbose); std::string allOutput; @@ -1617,8 +1621,10 @@ void WSLCSession::ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, LOG_LAST_ERROR_IF(shutdown(socket, SD_SEND) == SOCKET_ERROR); }; - io.AddHandle(std::make_unique>( - common::io::HandleWrapper{userHandle.Get(), std::move(onInputComplete)}, common::io::HandleWrapper{Request.stream.native_handle()})); + io.AddHandle( + std::make_unique>( + common::io::HandleWrapper{userHandle.Get(), std::move(onInputComplete)}, + common::io::HandleWrapper{Request.stream.native_handle()})); io.AddHandle( std::make_unique(Request, std::move(onHttpResponse), std::move(onProgress)), @@ -2288,6 +2294,73 @@ try } CATCH_RETURN(); +namespace { + + // Activity token returned by WSLCSession::BeginContainerOperation. While a client holds it, the + // session's activity count is non-zero, so the idle worker will not tear the VM down. It runs a + // release callback (which holds a strong reference to the session and decrements the activity + // count) when the client releases it or exits. It implements IFastRundown so that if the + // holding client crashes, COM reclaims the stub promptly (rather than via slow default rundown) + // and the VM can idle-terminate without a multi-minute delay. + class ContainerOperation + : public Microsoft::WRL::RuntimeClass, IUnknown, IFastRundown> + { + public: + // Adopts an activity-count reference already taken by BeginContainerOperation; the callback + // releases it. + void Initialize(std::function&& onRelease) noexcept + { + m_onRelease = std::move(onRelease); + } + + ~ContainerOperation() override + { + if (m_onRelease) + { + m_onRelease(); + } + } + + private: + std::function m_onRelease; + }; + +} // namespace + +HRESULT WSLCSession::BeginContainerOperation(IUnknown** Operation) +try +{ + WSLCExecutionContext context(this); + + RETURN_HR_IF_NULL(E_POINTER, Operation); + *Operation = nullptr; + + // Record the in-flight operation up front so the VM cannot idle-terminate before the client + // resolves the container and issues the operation (and streams any output). + m_activityCount.fetch_add(1); + auto countCleanup = wil::scope_exit([this]() { + m_activityCount.fetch_sub(1); + RequestIdleCheck(); + }); + + auto operation = Microsoft::WRL::Make(); + THROW_IF_NULL_ALLOC(operation.Get()); + + // Keep the session alive while the token is held so the release callback is always valid. + Microsoft::WRL::ComPtr self = this; + operation->Initialize([self = std::move(self)]() { + self->m_activityCount.fetch_sub(1); + self->RequestIdleCheck(); + }); + + // The token now owns the activity-count reference and will release it on destruction. + countCleanup.release(); + + RETURN_IF_FAILED(operation.CopyTo(Operation)); + return S_OK; +} +CATCH_RETURN(); + HRESULT WSLCSession::ListContainers( const WSLCListContainersOptions* Options, WSLCContainerEntry** Containers, ULONG* Count, WSLCContainerPortMapping** Ports, ULONG* PortsCount) try @@ -3203,12 +3276,14 @@ MultiHandleWait WSLCSession::CreateIOContext(HANDLE CancelHandle) io::MultiHandleWait io; // Cancel with E_ABORT if the session is terminating. - io.AddHandle(std::make_unique( - m_sessionTerminatingEvent.get(), [this]() { THROW_HR_MSG(E_ABORT, "Session %lu is terminating", m_id); })); + io.AddHandle(std::make_unique(m_sessionTerminatingEvent.get(), [this]() { + THROW_HR_MSG(E_ABORT, "Session %lu is terminating", m_id); + })); // Cancel with E_ABORT if the client process exits. - io.AddHandle(std::make_unique( - wslutil::OpenCallingProcess(SYNCHRONIZE), [this]() { THROW_HR_MSG(E_ABORT, "Client process has exited"); })); + io.AddHandle(std::make_unique(wslutil::OpenCallingProcess(SYNCHRONIZE), [this]() { + THROW_HR_MSG(E_ABORT, "Client process has exited"); + })); if (CancelHandle != nullptr) { diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index 8afdaa79a0..2cb745cc5c 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -145,6 +145,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession // Container management. IFACEMETHOD(CreateContainer)(_In_ const WSLCContainerOptions* Options, _In_opt_ IWarningCallback* WarningCallback, _Out_ IWSLCContainer** Container) override; IFACEMETHOD(OpenContainer)(_In_ LPCSTR Id, _In_ IWSLCContainer** Container) override; + IFACEMETHOD(BeginContainerOperation)(_Outptr_ IUnknown** Operation) override; IFACEMETHOD(ListContainers)( _In_opt_ const WSLCListContainersOptions* Options, _Out_ WSLCContainerEntry** Containers, From c3554ea952c199a5f5a5c8933c7ed96576b5ead7 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Wed, 10 Jun 2026 16:04:54 -0700 Subject: [PATCH 09/22] wslcsession: add idle grace period before tearing down an idle VM Tearing the VM down the instant it went idle could thrash it (repeated teardown/recreate) when containers are created and destroyed, or operations issued, in quick succession. Keep an otherwise-idle VM running for a short grace period and only tear it down once it has stayed idle for the whole window. The idle worker now waits with a timeout derived from a grace deadline. The deadline is armed when the VM is first observed idle and reset on any non-idle observation or explicit idle-check signal (raised on every lease/token release and terminal container state change), so teardown occurs a full grace period after the last activity. A WAIT_TIMEOUT wake means the VM has been continuously idle for the grace period and is torn down. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCSession.cpp | 62 +++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 3275d38875..2afc473ec2 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -37,6 +37,12 @@ constexpr auto c_dockerdReadyLogLine = "API listen on /var/run/docker.sock"; constexpr DWORD c_processTerminateTimeoutMs = 30 * 1000; constexpr DWORD c_processKillTimeoutMs = 10 * 1000; +// Grace period to keep an otherwise-idle VM running before tearing it down. This avoids +// thrashing the VM (repeated teardown/recreate) when containers are created and destroyed, +// or operations issued, in quick succession. The clock restarts whenever the VM is observed +// to be non-idle, so a full grace period of continuous idleness is required before teardown. +constexpr auto c_vmIdleGracePeriod = std::chrono::seconds(30); + namespace { // Group policy: WSLContainerRegistryAllowlist restricts which container-image @@ -649,16 +655,41 @@ void WSLCSession::IdleWorker() const HANDLE handles[] = {m_idleCheckEvent.get(), m_sessionTerminatingEvent.get()}; + // Absolute time at which a continuously-idle VM becomes eligible for teardown. Unset while + // the VM is non-idle; (re)armed when the VM is first observed idle. The wait below times out + // at this deadline so teardown happens promptly once the grace period elapses. + std::optional idleDeadline; + for (;;) { - const auto wait = WaitForMultipleObjects(ARRAYSIZE(handles), handles, FALSE, INFINITE); - if (wait != WAIT_OBJECT_0) + DWORD timeout = INFINITE; + if (idleDeadline.has_value()) + { + const auto now = std::chrono::steady_clock::now(); + timeout = (*idleDeadline <= now) + ? 0 + : static_cast(std::chrono::duration_cast(*idleDeadline - now).count()); + } + + const auto wait = WaitForMultipleObjects(ARRAYSIZE(handles), handles, FALSE, timeout); + + // handles[1] (session terminating) or a wait failure ends the worker. handles[0] (idle + // check) and WAIT_TIMEOUT (grace period may have elapsed) both trigger a re-evaluation. + if (wait != WAIT_OBJECT_0 && wait != WAIT_TIMEOUT) { - // Session is terminating (or the wait failed) — exit the worker. break; } - m_idleCheckEvent.ResetEvent(); + if (wait == WAIT_OBJECT_0) + { + m_idleCheckEvent.ResetEvent(); + + // An explicit idle-check signal means an operation or container state change just + // completed (it is raised on every lease/token release and terminal state change). + // Restart the grace clock so teardown happens a full grace period after the last + // activity, not after the first time the VM was ever observed idle. + idleDeadline.reset(); + } if (m_terminating.load()) { @@ -667,6 +698,7 @@ void WSLCSession::IdleWorker() if (!IdleTerminationEnabled()) { + idleDeadline.reset(); continue; } @@ -676,6 +708,7 @@ void WSLCSession::IdleWorker() if (m_terminating.load() || m_vmState.load() != VmState::Running) { + idleDeadline.reset(); continue; } @@ -687,15 +720,34 @@ void WSLCSession::IdleWorker() // path, whose Stop() self-join is a no-op and therefore cannot deadlock. if (m_vmExitedEvent && m_vmExitedEvent.is_signaled()) { + idleDeadline.reset(); continue; } - // Keep the VM alive while any operation is in flight or any container is non-terminal. + // Keep the VM alive while any operation is in flight or any container is non-terminal, + // and restart the grace clock so a fresh idle period is required afterwards. if (m_activityCount.load() != 0 || HasActiveContainerLockHeld()) + { + idleDeadline.reset(); + continue; + } + + // The VM is idle. Arm the grace period if the clock is not already running, then + // defer teardown until it has fully elapsed. The clock is reset by any non-idle + // observation or explicit idle-check signal (see the WAIT_OBJECT_0 handling above), + // so a WAIT_TIMEOUT wake here means the VM has been idle for the whole grace period. + const auto now = std::chrono::steady_clock::now(); + if (!idleDeadline.has_value()) + { + idleDeadline = now + c_vmIdleGracePeriod; + } + + if (now < *idleDeadline) { continue; } + idleDeadline.reset(); StopVmLockHeld(); } CATCH_LOG(); From 38e1ba39465741d9742621f1f222d1a4ec0f20fb Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 09:56:03 -0700 Subject: [PATCH 10/22] wslcsession: keep VM alive for the lifetime of a root-namespace process A root-namespace process created via CreateRootNamespaceProcess is not tracked as a container, so it did not contribute to the session activity count or HasActiveContainerLockHeld(). The VmLease taken during creation was released when the call returned, leaving the process eligible for idle teardown: once the grace period elapsed the idle worker could stop the VM and kill a long-lived root process (e.g. a plugin host) out from under the client. Bind an activity token to the returned WSLCProcess so the VM stays alive for as long as the client holds the process proxy. Factor the existing BeginContainerOperation token logic into CreateActivityToken() and reuse it here. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCProcess.h | 10 +++ src/windows/wslcsession/WSLCSession.cpp | 91 +++++++++++++------- src/windows/wslcsession/WSLCSession.h | 23 ++++- test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp | 33 ++++++- 4 files changed, 119 insertions(+), 38 deletions(-) diff --git a/src/windows/wslcsession/WSLCProcess.h b/src/windows/wslcsession/WSLCProcess.h index cce5543d91..bc4bc0c04d 100644 --- a/src/windows/wslcsession/WSLCProcess.h +++ b/src/windows/wslcsession/WSLCProcess.h @@ -41,9 +41,19 @@ class DECLSPEC_UUID("AFBEA6D6-D8A4-4F81-8FED-F947EB74B33B") WSLCProcess HANDLE GetExitEvent(); int GetPid() const; + // Attaches an opaque keep-alive token whose lifetime is bound to this process object. A + // root-namespace process is not tracked as a container, so it relies on this token to hold an + // activity reference on the owning session for as long as the client keeps the process alive, + // preventing the idle worker from tearing the VM down (and killing the process) underneath it. + void SetKeepAliveToken(Microsoft::WRL::ComPtr&& Token) noexcept + { + m_keepAliveToken = std::move(Token); + } + private: WSLCProcessFlags m_flags; std::shared_ptr m_control; std::unique_ptr m_io; + Microsoft::WRL::ComPtr m_keepAliveToken; }; } // namespace wsl::windows::service::wslc \ No newline at end of file diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 2afc473ec2..8d845e2557 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -641,9 +641,9 @@ bool WSLCSession::HasActiveContainerLockHeld() void WSLCSession::RequestIdleCheck() noexcept { - if (m_idleCheckEvent) + if (m_idleState->IdleCheckEvent) { - m_idleCheckEvent.SetEvent(); + m_idleState->IdleCheckEvent.SetEvent(); } } @@ -653,7 +653,7 @@ void WSLCSession::IdleWorker() // thread must join the process MTA; otherwise those Release/calls fail with RPC_E_WRONG_THREAD. const auto coInit = wil::CoInitializeEx(COINIT_MULTITHREADED); - const HANDLE handles[] = {m_idleCheckEvent.get(), m_sessionTerminatingEvent.get()}; + const HANDLE handles[] = {m_idleState->IdleCheckEvent.get(), m_sessionTerminatingEvent.get()}; // Absolute time at which a continuously-idle VM becomes eligible for teardown. Unset while // the VM is non-idle; (re)armed when the VM is first observed idle. The wait below times out @@ -682,7 +682,7 @@ void WSLCSession::IdleWorker() if (wait == WAIT_OBJECT_0) { - m_idleCheckEvent.ResetEvent(); + m_idleState->IdleCheckEvent.ResetEvent(); // An explicit idle-check signal means an operation or container state change just // completed (it is raised on every lease/token release and terminal state change). @@ -726,7 +726,7 @@ void WSLCSession::IdleWorker() // Keep the VM alive while any operation is in flight or any container is non-terminal, // and restart the grace clock so a fresh idle period is required afterwards. - if (m_activityCount.load() != 0 || HasActiveContainerLockHeld()) + if (m_idleState->ActivityCount.load() != 0 || HasActiveContainerLockHeld()) { idleDeadline.reset(); continue; @@ -763,10 +763,10 @@ WSLCSession::VmLease::VmLease(WSLCSession& Session) : m_session(&Session) { // Record an in-flight operation before bringing the VM up so the idle worker cannot tear // it down between EnsureVmRunning() and acquiring the shared lock. - m_session->m_activityCount.fetch_add(1); + m_session->m_idleState->ActivityCount.fetch_add(1); auto countCleanup = wil::scope_exit([this]() { - m_session->m_activityCount.fetch_sub(1); + m_session->m_idleState->ActivityCount.fetch_sub(1); m_session = nullptr; }); @@ -805,7 +805,7 @@ WSLCSession::VmLease& WSLCSession::VmLease::operator=(VmLease&& Other) noexcept if (m_session != nullptr) { m_lock.reset(); - m_session->m_activityCount.fetch_sub(1); + m_session->m_idleState->ActivityCount.fetch_sub(1); m_session->RequestIdleCheck(); } @@ -823,7 +823,7 @@ WSLCSession::VmLease::~VmLease() // Release the shared lock before triggering the idle check so the idle worker can // immediately take the exclusive lock if the session is now idle. m_lock.reset(); - m_session->m_activityCount.fetch_sub(1); + m_session->m_idleState->ActivityCount.fetch_sub(1); m_session->RequestIdleCheck(); } } @@ -2348,17 +2348,19 @@ CATCH_RETURN(); namespace { - // Activity token returned by WSLCSession::BeginContainerOperation. While a client holds it, the - // session's activity count is non-zero, so the idle worker will not tear the VM down. It runs a - // release callback (which holds a strong reference to the session and decrements the activity - // count) when the client releases it or exits. It implements IFastRundown so that if the - // holding client crashes, COM reclaims the stub promptly (rather than via slow default rundown) - // and the VM can idle-terminate without a multi-minute delay. + // Activity token created by WSLCSession::CreateActivityToken (returned directly by + // BeginContainerOperation, and bound to a root-namespace process's lifetime by + // CreateRootNamespaceProcess). While a client holds it, the session's activity count is + // non-zero, so the idle worker will not tear the VM down. It runs a release callback (which + // decrements the activity count via the shared IdleState, without keeping the session alive) + // when the client releases it or exits. It implements IFastRundown so that if the holding + // client crashes, COM reclaims the stub promptly (rather than via slow default rundown) and the + // VM can idle-terminate without a multi-minute delay. class ContainerOperation : public Microsoft::WRL::RuntimeClass, IUnknown, IFastRundown> { public: - // Adopts an activity-count reference already taken by BeginContainerOperation; the callback + // Adopts an activity-count reference already taken by CreateActivityToken; the callback // releases it. void Initialize(std::function&& onRelease) noexcept { @@ -2379,36 +2381,51 @@ namespace { } // namespace -HRESULT WSLCSession::BeginContainerOperation(IUnknown** Operation) -try +Microsoft::WRL::ComPtr WSLCSession::CreateActivityToken() { - WSLCExecutionContext context(this); - - RETURN_HR_IF_NULL(E_POINTER, Operation); - *Operation = nullptr; - - // Record the in-flight operation up front so the VM cannot idle-terminate before the client - // resolves the container and issues the operation (and streams any output). - m_activityCount.fetch_add(1); + // Record the in-flight activity up front so the VM cannot idle-terminate before the caller + // takes ownership of the returned token. + m_idleState->ActivityCount.fetch_add(1); auto countCleanup = wil::scope_exit([this]() { - m_activityCount.fetch_sub(1); + m_idleState->ActivityCount.fetch_sub(1); RequestIdleCheck(); }); auto operation = Microsoft::WRL::Make(); THROW_IF_NULL_ALLOC(operation.Get()); - // Keep the session alive while the token is held so the release callback is always valid. - Microsoft::WRL::ComPtr self = this; - operation->Initialize([self = std::move(self)]() { - self->m_activityCount.fetch_sub(1); - self->RequestIdleCheck(); + // Capture the shared idle state rather than the session itself: the token may outlive the + // session (e.g. a client keeps a root-namespace process proxy past releasing the session), and + // it must not keep the session alive. On release it decrements the activity count and wakes the + // idle worker; if the session is already gone this is a harmless no-op (no idle worker waits on + // the event). N.B. releasing the token therefore never blocks an explicit session teardown. + std::shared_ptr idleState = m_idleState; + operation->Initialize([idleState = std::move(idleState)]() { + idleState->ActivityCount.fetch_sub(1); + idleState->IdleCheckEvent.SetEvent(); }); // The token now owns the activity-count reference and will release it on destruction. countCleanup.release(); - RETURN_IF_FAILED(operation.CopyTo(Operation)); + Microsoft::WRL::ComPtr token; + THROW_IF_FAILED(operation.As(&token)); + return token; +} + +HRESULT WSLCSession::BeginContainerOperation(IUnknown** Operation) +try +{ + WSLCExecutionContext context(this); + + RETURN_HR_IF_NULL(E_POINTER, Operation); + *Operation = nullptr; + + // Record the in-flight operation up front so the VM cannot idle-terminate before the client + // resolves the container and issues the operation (and streams any output). + auto token = CreateActivityToken(); + + RETURN_IF_FAILED(token.CopyTo(Operation)); return S_OK; } CATCH_RETURN(); @@ -2601,6 +2618,14 @@ try THROW_HR_IF(HRESULT_FROM_WIN32(ERROR_INVALID_STATE), !m_virtualMachine); auto process = m_virtualMachine->CreateLinuxProcess(Executable, *Options, TtyRows, TtyColumns, Errno); + + // The VmLease above is released when this call returns, but the process keeps running in the + // VM and the client holds the returned proxy. A root-namespace process is not tracked as a + // container, so attach an activity token bound to the process's lifetime; this keeps the VM + // alive for as long as the client holds the process, preventing the idle worker from tearing + // the VM down and killing the process out from under the client. + process->SetKeepAliveToken(CreateActivityToken()); + THROW_IF_FAILED(process.CopyTo(Process)); return S_OK; diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index 2cb745cc5c..9c2a40cc00 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -240,6 +240,17 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession Stopping, }; + // Idle-activity state shared between the session and any outstanding activity tokens. Held via + // shared_ptr so a token can outlive the session (e.g. a client keeps a root-namespace process + // proxy past releasing the session) and still safely release its activity reference without + // keeping the session object alive. Tearing down the session therefore proceeds normally; a + // late token release simply decrements the count and signals an event with no waiter. + struct IdleState + { + std::atomic ActivityCount{0}; + wil::unique_event IdleCheckEvent{wil::EventOptions::ManualReset}; + }; + _Requires_exclusive_lock_held_(m_lock) void StartVmLockHeld(); _Requires_exclusive_lock_held_(m_lock) @@ -249,6 +260,13 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession _Requires_exclusive_lock_held_(m_lock) bool HasActiveContainerLockHeld(); void EnsureVmRunning(); + + // Creates an opaque activity token that holds a reference on this session's activity count for + // its lifetime, deferring idle teardown of the VM until every outstanding token is released. + // Used both for transient client operations (BeginContainerOperation) and to keep the VM alive + // for the lifetime of a root-namespace process. + Microsoft::WRL::ComPtr CreateActivityToken(); + void IdleWorker(); bool IdleTerminationEnabled() const noexcept; void PersistSettings(const WSLCSessionInitSettings& Settings, PSID UserSid); @@ -349,11 +367,12 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession // VM lifecycle / idle-termination state. std::atomic m_vmState{VmState::None}; - std::atomic m_activityCount{0}; std::atomic m_vmStopRequested{false}; // Number of times the VM has been (re)created; surfaced via GetVmDiagnostics. std::atomic m_vmStartCount{0}; - wil::unique_event m_idleCheckEvent{wil::EventOptions::ManualReset}; + // In-flight activity count and idle-worker wake event, decoupled from this object's lifetime + // (see IdleState) so activity tokens never extend the session's lifetime. + std::shared_ptr m_idleState{std::make_shared()}; std::thread m_idleThread; // Persisted settings required to (re)create the VM on demand. The string fields point diff --git a/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp index cf8783db8e..b88c212d2c 100644 --- a/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp +++ b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp @@ -20,6 +20,7 @@ Module Name: #include "windows/Common.h" #include "WSLCExecutor.h" #include "WSLCE2EHelpers.h" +#include namespace WSLCE2ETests { using namespace wsl::shared; @@ -114,13 +115,39 @@ class WSLCE2EVmIdleTests VERIFY_IS_TRUE(static_cast(QueryDiagnostics(session).Running)); // Removing the only container drops the active count to zero and the VM idles. - RunWslcAndVerify( - std::format(L"container rm --session {} {}", session.Name(), containerName), {.Stderr = L"", .ExitCode = 0}); + RunWslcAndVerify(std::format(L"container rm --session {} {}", session.Name(), containerName), {.Stderr = L"", .ExitCode = 0}); + + WaitForVmRunningState(session, false); + } + + // A long-lived root-namespace process (created via CreateRootNamespaceProcess) is not tracked + // as a container, so it does not contribute to the active-container check. It must nonetheless + // keep the VM alive for as long as the client holds the returned process, via the activity + // token bound to the process's lifetime. Without that token the idle worker would tear the VM + // down once the grace period elapsed, killing the process out from under the client. + WSLC_TEST_METHOD(WSLCE2E_VmIdle_RootProcessKeepsVmAlive) + { + auto session = TestSession::Create(L"wslc-vmidle-rootproc"); + + // Launch a long-running root-namespace process and keep the returned process object alive. + // This brings the VM up on demand to host the process. + wsl::windows::common::WSLCProcessLauncher launcher("/bin/sleep", {"/bin/sleep", "3600"}); + std::optional process = launcher.Launch(*session.Session()); + + WaitForVmRunningState(session, true); + + // The VM must remain running past the idle grace period (30s) while the process is held, + // even though there are no containers and no in-flight operations. Without the keep-alive + // token the idle worker would have torn the VM down ~30s after the creating call returned, + // so a generous margin past the grace period reliably catches that regression. + std::this_thread::sleep_for(std::chrono::seconds(40)); + VERIFY_IS_TRUE(static_cast(QueryDiagnostics(session).Running)); + // Releasing the process proxy drops the activity count to zero and the VM idle-terminates. + process.reset(); WaitForVmRunningState(session, false); } - // Stress the lease-vs-idle-teardown race: each run idles the VM, and the next run's lease // arrives while teardown may still be in flight. All operations must succeed (no spurious // ERROR_INVALID_STATE from racing a VM that is stopping). WSLC_TEST_METHOD(WSLCE2E_VmIdle_ConcurrentRecreateDoesNotFail) From f38c5568b6bf581259b9e1df83286483e35ee598 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 13:06:21 -0700 Subject: [PATCH 11/22] wslcsession: apply clang-format 19.1.5 formatting Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../wslc/services/ContainerService.cpp | 10 +++--- src/windows/wslcsession/WSLCSession.cpp | 36 ++++++++----------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/windows/wslc/services/ContainerService.cpp b/src/windows/wslc/services/ContainerService.cpp index 49e3a250ba..4a712190aa 100644 --- a/src/windows/wslc/services/ContainerService.cpp +++ b/src/windows/wslc/services/ContainerService.cpp @@ -604,15 +604,13 @@ void ContainerService::Logs(Session& session, const std::string& id, bool follow THROW_IF_FAILED(container->Logs(flags, &stdoutHandle, &stderrHandle, since, until, tail)); wsl::windows::common::io::MultiHandleWait io; - io.AddHandle( - std::make_unique>( - stdoutHandle.Release(), GetStdHandle(STD_OUTPUT_HANDLE))); + io.AddHandle(std::make_unique>( + stdoutHandle.Release(), GetStdHandle(STD_OUTPUT_HANDLE))); if (!stderrHandle.Empty()) // This handle is only used for non-tty processes. { - io.AddHandle( - std::make_unique>( - stderrHandle.Release(), GetStdHandle(STD_ERROR_HANDLE))); + io.AddHandle(std::make_unique>( + stderrHandle.Release(), GetStdHandle(STD_ERROR_HANDLE))); } // TODO: Handle ctrl-c. diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 8d845e2557..041c2f9ba7 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -1012,13 +1012,11 @@ ServiceRunningProcess WSLCSession::StartProcess( auto process = launcher.Launch(*m_virtualMachine); - m_ioRelay->AddHandle( - std::make_unique( - process.GetStdHandle(1), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); + m_ioRelay->AddHandle(std::make_unique( + process.GetStdHandle(1), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); - m_ioRelay->AddHandle( - std::make_unique( - process.GetStdHandle(2), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); + m_ioRelay->AddHandle(std::make_unique( + process.GetStdHandle(2), [this, LogSource](const auto& data) { OnProcessLog(data, LogSource); }, false)); m_ioRelay->AddHandle(std::make_unique(process.GetExitEvent(), std::move(ExitCallback))); @@ -1128,9 +1126,8 @@ void WSLCSession::StreamImageOperation(DockerHTTPClient::HTTPRequestContext& req auto onCompleted = [&]() { io.Cancel(); }; - io.AddHandle( - std::make_unique( - requestContext, std::move(onHttpResponse), std::move(onChunk), std::move(onCompleted))); + io.AddHandle(std::make_unique( + requestContext, std::move(onHttpResponse), std::move(onChunk), std::move(onCompleted))); io.Run({}); @@ -1302,9 +1299,8 @@ try auto io = CreateIOContext(); - io.AddHandle( - std::make_unique>( - buildFileHandle.Get(), common::io::HandleWrapper{buildProcess.GetStdHandle(WSLCFDStdin)})); + io.AddHandle(std::make_unique>( + buildFileHandle.Get(), common::io::HandleWrapper{buildProcess.GetStdHandle(WSLCFDStdin)})); bool verbose = WI_IsFlagSet(Options->Flags, WSLCBuildImageFlagsVerbose); std::string allOutput; @@ -1673,10 +1669,8 @@ void WSLCSession::ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, LOG_LAST_ERROR_IF(shutdown(socket, SD_SEND) == SOCKET_ERROR); }; - io.AddHandle( - std::make_unique>( - common::io::HandleWrapper{userHandle.Get(), std::move(onInputComplete)}, - common::io::HandleWrapper{Request.stream.native_handle()})); + io.AddHandle(std::make_unique>( + common::io::HandleWrapper{userHandle.Get(), std::move(onInputComplete)}, common::io::HandleWrapper{Request.stream.native_handle()})); io.AddHandle( std::make_unique(Request, std::move(onHttpResponse), std::move(onProgress)), @@ -3353,14 +3347,12 @@ MultiHandleWait WSLCSession::CreateIOContext(HANDLE CancelHandle) io::MultiHandleWait io; // Cancel with E_ABORT if the session is terminating. - io.AddHandle(std::make_unique(m_sessionTerminatingEvent.get(), [this]() { - THROW_HR_MSG(E_ABORT, "Session %lu is terminating", m_id); - })); + io.AddHandle(std::make_unique( + m_sessionTerminatingEvent.get(), [this]() { THROW_HR_MSG(E_ABORT, "Session %lu is terminating", m_id); })); // Cancel with E_ABORT if the client process exits. - io.AddHandle(std::make_unique(wslutil::OpenCallingProcess(SYNCHRONIZE), [this]() { - THROW_HR_MSG(E_ABORT, "Client process has exited"); - })); + io.AddHandle(std::make_unique( + wslutil::OpenCallingProcess(SYNCHRONIZE), [this]() { THROW_HR_MSG(E_ABORT, "Client process has exited"); })); if (CancelHandle != nullptr) { From 7a08418826b7465d7ee99414fa823168b92bf475 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 13:10:10 -0700 Subject: [PATCH 12/22] wslcsession: update WSLCSession class comment for factory-based lazy VM lifecycle Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCSession.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index 9c2a40cc00..b83e8c175a 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -73,7 +73,8 @@ class UserCOMCallback // // WSLCSession - Implements IWSLCSession for container management. // Runs in a per-user COM server process for security isolation. -// The SYSTEM service creates the VM and passes IWSLCVirtualMachine to Initialize(). +// The SYSTEM service passes an IWSLCVirtualMachineFactory to Initialize(); the VM is created +// lazily on first use and may be torn down when idle and recreated on demand. // class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession : public Microsoft::WRL::RuntimeClass, IWSLCSession, IFastRundown, ISupportErrorInfo> From 5e599bde91b62625406bc03f40877660baad39aa Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 15:41:27 -0700 Subject: [PATCH 13/22] wslcsession: resolve rebase artifacts from termination-event integration Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCSession.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 041c2f9ba7..aeef497988 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -3139,9 +3139,6 @@ try // Acquire an exclusive lock to ensure that no operation is running. WI_VERIFY(sessionLock); - // Acquire an exclusive lock to ensure that no operation is running. - WI_VERIFY(sessionLock); - // Tear down the VM (if running) and all VM-scoped state, capturing the termination reason. // This mirrors the soft teardown used for idle shutdown, but here it is permanent. TearDownVmLockHeld(/* CaptureTerminationReason */ true); @@ -3162,7 +3159,7 @@ try m_idleThread.join(); } - // The idle worker has exited and no operation can run past m_terminated, so the parked VM + // The idle worker has exited and no operation can run past termination, so the parked VM // factory can no longer be re-fetched; revoke it from the GIT. if (m_vmFactoryGitCookie != 0) { From 3a01811b24372e92298523b0b3fa8ec67177153e Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 15:44:06 -0700 Subject: [PATCH 14/22] wslc: remove dead TerminationCallback plumbing from VM factory PR #40767 (terminateEvent) removed ITerminationCallback / WSLCSessionSettings.TerminationCallback from the IDL but left the WSLCVirtualMachineFactory (introduced later by factory PR #40770, which its branch merged from master) still referencing them, so the branch did not compile. Drop the dead member and assignments; the VM now caches the reason for WSLCSession to pull. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/service/exe/HcsVirtualMachine.cpp | 2 -- src/windows/service/exe/HcsVirtualMachine.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/windows/service/exe/HcsVirtualMachine.cpp b/src/windows/service/exe/HcsVirtualMachine.cpp index fb6a1d0bd7..7cdbf7afaf 100644 --- a/src/windows/service/exe/HcsVirtualMachine.cpp +++ b/src/windows/service/exe/HcsVirtualMachine.cpp @@ -872,7 +872,6 @@ WSLCVirtualMachineFactory::WSLCVirtualMachineFactory(_In_ const WSLCSessionSetti m_dmesgOutput.reset(wslutil::DuplicateHandle(wslutil::FromCOMInputHandle(Settings->DmesgOutput), GENERIC_WRITE | SYNCHRONIZE)); } - m_terminationCallback = Settings->TerminationCallback; m_maximumStorageSizeMb = Settings->MaximumStorageSizeMb; m_cpuCount = Settings->CpuCount; m_memoryMb = Settings->MemoryMb; @@ -892,7 +891,6 @@ WSLCSessionSettings WSLCVirtualMachineFactory::BuildSettings() settings.MemoryMb = m_memoryMb; settings.BootTimeoutMs = m_bootTimeoutMs; settings.NetworkingMode = m_networkingMode; - settings.TerminationCallback = m_terminationCallback.get(); settings.FeatureFlags = m_featureFlags; settings.StorageFlags = m_storageFlags; settings.RootVhdOverride = m_rootVhdOverride ? m_rootVhdOverride->c_str() : nullptr; diff --git a/src/windows/service/exe/HcsVirtualMachine.h b/src/windows/service/exe/HcsVirtualMachine.h index e59995c95d..2a862d1435 100644 --- a/src/windows/service/exe/HcsVirtualMachine.h +++ b/src/windows/service/exe/HcsVirtualMachine.h @@ -139,8 +139,6 @@ class WSLCVirtualMachineFactory // subsequent VMs reuse this duplicate, whose writes simply fail if the sink is gone. wil::unique_handle m_dmesgOutput; - wil::com_ptr m_terminationCallback; - ULONGLONG m_maximumStorageSizeMb{}; ULONG m_cpuCount{}; ULONG m_memoryMb{}; From 80282b5a542a063eca52aab93e990d17b05840b3 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 16:20:26 -0700 Subject: [PATCH 15/22] fix ~HcsVirtualMachine deadlock with OnExit on m_lock Kevin's PR #40767 (event-model termination) moved m_vmExitEvent.SetEvent() in OnExit() to after a new std::lock_guard(m_lock) that caches the termination reason. ~HcsVirtualMachine already holds m_lock across the 5s exit-event wait and HcsCloseComputeSystem (which drains in-flight HCS callbacks). An in-flight OnExit() therefore blocks acquiring m_lock, so it never signals the exit event nor drains, and the close never completes: a hard deadlock that StuckVmTermination reliably reproduces. Drop the broad lock from the dtor. By the time the compute system is closed no further callbacks can run, so the remaining teardown is safe unguarded. Flag to Kevin to fold into #40767. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/service/exe/HcsVirtualMachine.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/windows/service/exe/HcsVirtualMachine.cpp b/src/windows/service/exe/HcsVirtualMachine.cpp index 7cdbf7afaf..84679878bb 100644 --- a/src/windows/service/exe/HcsVirtualMachine.cpp +++ b/src/windows/service/exe/HcsVirtualMachine.cpp @@ -313,7 +313,11 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings) HcsVirtualMachine::~HcsVirtualMachine() { - std::lock_guard lock(m_lock); + // N.B. Do not hold m_lock here. OnExit() acquires m_lock to cache the termination reason + // before signaling m_vmExitEvent, and closing the compute system below drains any in-flight + // HCS exit/crash callbacks. Holding m_lock across the exit-event wait and HcsCloseComputeSystem + // would deadlock against an in-flight OnExit() that is blocked acquiring m_lock. By the time the + // compute system is closed no further callbacks can run, so the remaining teardown is unguarded. // Wait up to 5 seconds for the VM to terminate gracefully. bool forceTerminate = false; From 6d43b6a0daa89a1d35fb6869054037ecfa598303 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 17:56:48 -0700 Subject: [PATCH 16/22] Preserve recovery warnings in lazy-VM model In the lazy-VM model, container/volume/network recovery runs at first VM start rather than during CreateSession, so the create-time WarningCallback was out of scope by the time recovery emitted warnings. Park the session's WarningCallback in the GIT and have WSLCExecutionContext fall back to it when an operation has no explicit callback, so lazy-recovery warnings still reach the user. CLI-side, keep the create/enter callback alive for the whole command by storing it in the Session model. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslc/services/SessionModel.h | 8 +++++- src/windows/wslc/services/SessionService.cpp | 5 +++- .../wslcsession/WSLCExecutionContext.h | 16 +++++++++-- src/windows/wslcsession/WSLCSession.cpp | 27 +++++++++++++++++++ src/windows/wslcsession/WSLCSession.h | 12 +++++++++ test/windows/WSLCTests.cpp | 12 +++++++++ 6 files changed, 76 insertions(+), 4 deletions(-) diff --git a/src/windows/wslc/services/SessionModel.h b/src/windows/wslc/services/SessionModel.h index 6111bf2b44..48ce0fe800 100644 --- a/src/windows/wslc/services/SessionModel.h +++ b/src/windows/wslc/services/SessionModel.h @@ -19,7 +19,8 @@ namespace wsl::windows::wslc::models { struct Session { - explicit Session(wil::com_ptr session) : m_session(std::move(session)) + explicit Session(wil::com_ptr session, wil::com_ptr warningCallback = {}) : + m_session(std::move(session)), m_warningCallback(std::move(warningCallback)) { } IWSLCSession* Get() const noexcept @@ -39,6 +40,11 @@ struct Session private: wil::com_ptr m_session; + + // Kept alive for the lifetime of the session model (i.e. the whole CLI command) so the service + // can deliver warnings emitted by lazy/background work — such as resource recovery on the first + // VM start — back to this CLI invocation, even though no single COM call carries the callback. + wil::com_ptr m_warningCallback; }; } // namespace wsl::windows::wslc::models \ No newline at end of file diff --git a/src/windows/wslc/services/SessionService.cpp b/src/windows/wslc/services/SessionService.cpp index d382cfea62..eb160bfaab 100644 --- a/src/windows/wslc/services/SessionService.cpp +++ b/src/windows/wslc/services/SessionService.cpp @@ -118,7 +118,10 @@ Session SessionService::CreateDefaultSession() auto warningCallback = Microsoft::WRL::Make(); THROW_IF_FAILED(sessionManager->CreateSession(nullptr, WSLCSessionFlagsNone, warningCallback.Get(), &session)); wsl::windows::common::security::ConfigureForCOMImpersonation(session.get()); - return Session(std::move(session)); + + // Hold the warning callback for the lifetime of the session so warnings emitted by the lazy VM + // start (e.g. resource recovery) are still delivered to this CLI invocation. + return Session(std::move(session), wil::com_ptr(warningCallback.Get())); } int SessionService::Enter(const std::wstring& storagePath, const std::wstring& displayName) diff --git a/src/windows/wslcsession/WSLCExecutionContext.h b/src/windows/wslcsession/WSLCExecutionContext.h index 5d75bfed14..b3abec5a0e 100644 --- a/src/windows/wslcsession/WSLCExecutionContext.h +++ b/src/windows/wslcsession/WSLCExecutionContext.h @@ -27,7 +27,19 @@ class WSLCExecutionContext : public wsl::windows::common::COMServiceExecutionCon protected: bool CollectUserWarning(const std::wstring& warning) override { - if (m_warningCallback != nullptr) + IWarningCallback* callback = m_warningCallback; + + // When the operation carries no explicit callback, fall back to the callback supplied when + // the session was created/entered. This routes warnings emitted outside a callback-bearing + // operation (e.g. resource recovery during the lazy VM start) back to the session creator. + wil::com_ptr sessionCallback; + if (callback == nullptr && m_session != nullptr) + { + sessionCallback = m_session->AcquireWarningCallback(); + callback = sessionCallback.get(); + } + + if (callback != nullptr) { std::unique_ptr comCallback; if (m_session != nullptr) @@ -35,7 +47,7 @@ class WSLCExecutionContext : public wsl::windows::common::COMServiceExecutionCon comCallback = std::make_unique(m_session->RegisterUserCOMCallback()); } - auto hr = m_warningCallback->OnWarning(warning.c_str()); + auto hr = callback->OnWarning(warning.c_str()); if (SUCCEEDED(hr) || hr == RPC_E_CALL_CANCELED || hr == HRESULT_FROM_WIN32(ERROR_CANCELLED)) { return true; diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index aeef497988..4da7321341 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -329,6 +329,14 @@ try m_git = wil::CoCreateInstance(CLSID_StdGlobalInterfaceTable, CLSCTX_INPROC_SERVER); THROW_IF_FAILED(m_git->RegisterInterfaceInGlobal(VmFactory, __uuidof(IWSLCVirtualMachineFactory), &m_vmFactoryGitCookie)); + // Park the warning callback too. The VM (and resource recovery) is created lazily on the + // first operation, which may not carry its own warning callback, so recovery warnings are + // routed back to this callback via AcquireWarningCallback()/WSLCExecutionContext. + if (WarningCallback != nullptr) + { + THROW_IF_FAILED(m_git->RegisterInterfaceInGlobal(WarningCallback, __uuidof(IWarningCallback), &m_warningCallbackGitCookie)); + } + // Persist a deep copy of the settings (and the creating user's SID) required to // (re)create the VM on demand. const auto tokenInfo = wil::get_token_information(GetCurrentProcessToken()); @@ -3167,10 +3175,29 @@ try m_vmFactoryGitCookie = 0; } + if (m_warningCallbackGitCookie != 0) + { + LOG_IF_FAILED(m_git->RevokeInterfaceFromGlobal(m_warningCallbackGitCookie)); + m_warningCallbackGitCookie = 0; + } + return S_OK; } CATCH_RETURN(); +wil::com_ptr WSLCSession::AcquireWarningCallback() const +{ + wil::com_ptr callback; + if (m_warningCallbackGitCookie != 0) + { + // Best-effort: the creating client's proxy may already be gone (e.g. the CLI exited before + // a later VM restart), in which case the warning falls through to the default sink. + LOG_IF_FAILED(m_git->GetInterfaceFromGlobal(m_warningCallbackGitCookie, __uuidof(IWarningCallback), callback.put_void())); + } + + return callback; +} + HRESULT WSLCSession::RegisterCrashDumpCallback(_In_ ICrashDumpCallback* Callback, _Out_ IUnknown** Subscription) try { diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index b83e8c175a..f931b40f5a 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -213,6 +213,13 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession UserCOMCallback RegisterUserCOMCallback(); void UnregisterUserCOMCallback(DWORD ThreadId); + // Returns the warning callback supplied when the session was created/entered, re-marshalled + // into the calling apartment. Used as a fallback by WSLCExecutionContext so that warnings + // emitted by operations that carry no explicit callback (e.g. resource recovery during the + // lazy VM start) still reach the session creator. Returns null if no callback was supplied + // or the creating client's proxy is no longer reachable. + wil::com_ptr AcquireWarningCallback() const; + HANDLE SessionTerminatingEvent() const noexcept { return m_sessionTerminatingEvent.get(); @@ -340,6 +347,11 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession wil::com_ptr m_git; DWORD m_vmFactoryGitCookie{}; + // The warning callback supplied at Initialize() is parked in the GIT for the same reason as + // the VM factory: it is used later, on demand, from other threads/apartments (a directly + // stored proxy would fail with RPC_E_WRONG_THREAD). Zero if no callback was supplied. + DWORD m_warningCallbackGitCookie{}; + std::optional m_virtualMachine; std::optional m_eventTracker; wil::unique_event m_dockerdReadyEvent{wil::EventOptions::ManualReset}; diff --git a/test/windows/WSLCTests.cpp b/test/windows/WSLCTests.cpp index 74d8cb05e7..a30a8911ab 100644 --- a/test/windows/WSLCTests.cpp +++ b/test/windows/WSLCTests.cpp @@ -10338,6 +10338,10 @@ class WSLCTests auto settings = GetDefaultSessionSettings(c_sessionName); auto session = CreateSession(settings); + // Session creation is lazy, so start the VM by launching a process before killing it. + WSLCProcessLauncher launcher("/bin/sleep", {"/bin/sleep", "60"}); + auto process = launcher.Launch(*session); + KillVmByOwner(c_sessionName); WaitForSessionTermination(session.get()); @@ -10429,6 +10433,10 @@ class WSLCTests VERIFY_SUCCEEDED(sessionManager2->CreateSession(&settings2, WSLCSessionFlagsNone, warningCallback.Get(), &session2)); wsl::windows::common::security::ConfigureForCOMImpersonation(session2.get()); + // The VM (and container recovery) starts lazily on the first operation. Trigger it so + // recovery runs and its warning is delivered to the session's warning callback. + WSLCProcessLauncher("/bin/sh", {"/bin/sh", "-c", "exit 0"}).Launch(*session2).GetExitEvent().wait(30000); + // Verify the warning matches the expected localized message for the corrupt container. auto warnings = warningCallback->GetWarnings(); auto expectedWarning = std::format( @@ -10497,6 +10505,10 @@ class WSLCTests VERIFY_SUCCEEDED(sessionManager->CreateSession(&settings, WSLCSessionFlagsNone, warningCallback.Get(), &session)); wsl::windows::common::security::ConfigureForCOMImpersonation(session.get()); + // The VM (and volume recovery) starts lazily on the first operation. Trigger it so + // recovery runs and its warning is delivered to the session's warning callback. + WSLCProcessLauncher("/bin/sh", {"/bin/sh", "-c", "exit 0"}).Launch(*session).GetExitEvent().wait(30000); + // Verify the warning matches the expected localized message for the missing volume. auto warnings = warningCallback->GetWarnings(); auto expectedWarning = From cb8e2404d0cdedc9bfbc5a9ea6236fb60c62cd60 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 18:11:40 -0700 Subject: [PATCH 17/22] Fix lazy-VM idle worker serializing concurrent operations ConfigureStorage validates the storage path lazily (via AttachDisk at first VM start), so with a lazily-created VM a WSLCSessionStorageFlagsNoCreate session pointing at a missing path no longer failed at CreateSession. Validate the storage VHD existence eagerly in Initialize so misconfiguration is reported up front. The idle worker acquired m_lock exclusively (blocking) on every wake. Because SRW locks favor a waiting writer, that pending acquire stalled all new shared-lock operations behind it, so a long-running operation holding its shared VmLease (e.g. a blocking SaveImage/Export) serialized every concurrent operation until it completed. Use try_lock_exclusive and treat contention as activity, re-evaluating on the next idle-check signal. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCSession.cpp | 29 +++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 4da7321341..281f729167 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -316,6 +316,20 @@ try // failures are streamed to the CLI. WSLCExecutionContext warningContext(this, WarningCallback); + // The VM (and storage VHD) is created lazily on the first operation. Validate the storage + // configuration eagerly here so misconfiguration is reported at session creation rather than + // surfacing later on the first VM-starting operation. With WSLCSessionStorageFlagsNoCreate the + // storage VHD must already exist (ConfigureStorage will not create it). + if (Settings->StoragePath != nullptr && WI_IsFlagSet(Settings->StorageFlags, WSLCSessionStorageFlagsNoCreate)) + { + const std::filesystem::path storagePath{Settings->StoragePath}; + THROW_HR_WITH_USER_ERROR_IF(E_INVALIDARG, Localization::MessagePathNotAbsolute(Settings->StoragePath), !storagePath.is_absolute()); + THROW_HR_WITH_USER_ERROR_IF( + HRESULT_FROM_WIN32(ERROR_PATH_NOT_FOUND), + Localization::MessageWslcSessionStorageNotFound(Settings->StoragePath), + !std::filesystem::exists(storagePath / "storage.vhdx")); + } + // N.B. No locking is required because Initialize() is always called before the session is returned to the caller. m_id = Settings->SessionId; m_displayName = Settings->DisplayName ? Settings->DisplayName : L""; @@ -712,8 +726,19 @@ void WSLCSession::IdleWorker() try { - auto lock = m_lock.lock_exclusive(); - + // Use a non-blocking acquire. A blocking exclusive acquire would queue behind any + // in-flight operation's shared VmLease and, because SRW locks favor a waiting writer, + // would stall every new operation behind it until that operation completed. A + // long-running operation (e.g. a blocking SaveImage/Export) would therefore serialize + // all concurrent operations. If the lock is currently held an operation is in flight, + // so treat it as activity and re-evaluate on the next idle-check signal (raised when + // that operation releases its lease). + auto lock = m_lock.try_lock_exclusive(); + if (!lock) + { + idleDeadline.reset(); + continue; + } if (m_terminating.load() || m_vmState.load() != VmState::Running) { idleDeadline.reset(); From 2987fa80761e1f461f8223f30ef7624c95c34365 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 18:17:18 -0700 Subject: [PATCH 18/22] Adapt SessionEnter storage-not-found E2E to eager friendly message Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/windows/wslc/e2e/WSLCE2ESessionEnterTests.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/windows/wslc/e2e/WSLCE2ESessionEnterTests.cpp b/test/windows/wslc/e2e/WSLCE2ESessionEnterTests.cpp index c10f432951..c512452cf8 100644 --- a/test/windows/wslc/e2e/WSLCE2ESessionEnterTests.cpp +++ b/test/windows/wslc/e2e/WSLCE2ESessionEnterTests.cpp @@ -109,8 +109,14 @@ class WSLCE2ESessionEnterTests WSLC_TEST_METHOD(WSLCE2E_SessionEnter_StoragePathNotFound) { auto result = RunWslc(L"system session enter does-not-exist"); + + // The CLI resolves the storage argument to an absolute path (see EnterSession task) and the + // service validates it eagerly at session creation, reporting the friendly "No WSLC session + // found in ''" message rather than a bare system error. + const auto storagePath = std::filesystem::absolute(L"does-not-exist").wstring(); result.Verify({ - .Stderr = L"The system cannot find the path specified. \r\nError code: ERROR_PATH_NOT_FOUND\r\n", + .Stderr = wsl::shared::Localization::MessageWslcSessionStorageNotFound(storagePath) + + L"\r\nError code: ERROR_PATH_NOT_FOUND\r\n", .ExitCode = 1, }); } From 46eda9e6134058236535faa23ba34bdd14172290 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Fri, 12 Jun 2026 08:36:45 -0700 Subject: [PATCH 19/22] Guard idle worker thread entrypoint against unhandled exceptions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCSession.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 281f729167..7644fffd3c 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -364,7 +364,16 @@ try // The VM is created lazily on the first operation that requires it (see EnsureVmRunning) // and torn down when the session becomes idle. Start the worker that performs idle teardown. - m_idleThread = std::thread([this]() { IdleWorker(); }); + // The body is wrapped so an unexpected throw (e.g. COM initialization failure before the + // worker's own try/catch loop) is logged rather than escaping the thread and calling + // std::terminate(), which would crash the session process. + m_idleThread = std::thread([this]() { + try + { + IdleWorker(); + } + CATCH_LOG() + }); return S_OK; } From ea2254cf60a19f5f1e8b6629a09908b0fa1ad18e Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Fri, 12 Jun 2026 10:46:36 -0700 Subject: [PATCH 20/22] Pin VM while client holds container/process proxies Idle teardown destroyed container impls and disconnected their COM wrappers, leaving any outstanding client proxy with RPC_E_DISCONNECTED. Keep the VM alive while a client still holds a proxy: - Containers: HasActiveContainerLockHeld now also keeps the VM up while a container wrapper is externally referenced (refcount > the single internal m_comWrapper reference). WSLCContainer::Release() wakes the idle worker when the last client proxy is released so the VM is reclaimed promptly. - Exec processes: the returned WSLCProcess wrapper (not retained internally) now carries a keep-alive activity token for its client-held lifetime, mirroring root-namespace processes. Adds WSLCE2E_VmIdle_HeldContainerProxyKeepsVmAlive covering a held exited-container proxy. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCContainer.cpp | 48 ++++++++++++++++++++ src/windows/wslcsession/WSLCContainer.h | 22 +++++++++ src/windows/wslcsession/WSLCSession.cpp | 12 ++++- src/windows/wslcsession/WSLCSession.h | 13 +++--- test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp | 43 ++++++++++++++++++ 5 files changed, 130 insertions(+), 8 deletions(-) diff --git a/src/windows/wslcsession/WSLCContainer.cpp b/src/windows/wslcsession/WSLCContainer.cpp index fd7638fd4e..e9d9f54d2f 100644 --- a/src/windows/wslcsession/WSLCContainer.cpp +++ b/src/windows/wslcsession/WSLCContainer.cpp @@ -675,6 +675,21 @@ void WSLCContainerImpl::CopyTo(IWSLCContainer** Container) const THROW_IF_FAILED(m_comWrapper.CopyTo(Container)); } +bool WSLCContainerImpl::IsExternallyReferenced() const noexcept +{ + auto lock = m_lock.lock_shared(); + + // The impl owns exactly one reference to the COM wrapper (m_comWrapper); any additional + // references belong to clients holding marshaled proxies. A null wrapper means the container has + // already been disconnected, so there is nothing left to keep the VM alive for. + if (m_comWrapper == nullptr) + { + return false; + } + + return m_comWrapper->HasExternalReference(); +} + void WSLCContainerImpl::Attach(LPCSTR DetachKeys, WSLCHandle* Stdin, WSLCHandle* Stdout, WSLCHandle* Stderr) const { auto lock = m_lock.lock_shared(); @@ -1248,6 +1263,12 @@ void WSLCContainerImpl::Exec(const WSLCProcessOptions* Options, const WSLCProces } while (!control->GetExitEvent().wait(100)); auto process = wil::MakeOrThrow(std::move(control), std::move(io), Options->Flags); + + // The exec'd process wrapper is handed to the client and is not retained internally, so its + // lifetime tracks the client's proxy. Bind a keep-alive token to it so the idle worker does + // not tear the VM down (killing the process) while the client still holds the proxy. + process->SetKeepAliveToken(m_wslcSession.CreateActivityToken()); + THROW_IF_FAILED(process.CopyTo(__uuidof(IWSLCProcess), (void**)Process)); } CATCH_AND_THROW_DOCKER_USER_ERROR("Failed to exec process in container %hs", m_id.c_str()); @@ -2140,6 +2161,33 @@ WSLCContainer::WSLCContainer(WSLCContainerImpl* impl, WSLCSession& session, std: { } +ULONG STDMETHODCALLTYPE WSLCContainer::Release() +{ + const ULONG count = RuntimeClassBase::Release(); + + // A count of 1 means only WSLCContainerImpl::m_comWrapper (the single internal reference) is + // left, i.e. a client just released its last proxy. Wake the idle worker so the now-idle VM can + // be reclaimed. N.B. at count 0 the object has already been destroyed, so members (including + // m_session) must not be touched on that path. + if (count == 1) + { + m_session.RequestIdleCheck(); + } + + return count; +} + +bool WSLCContainer::HasExternalReference() noexcept +{ + // Read the current reference count without retaining a lasting reference. Call the base + // Release() directly (not the override above) so this query never triggers a spurious idle + // check, which would otherwise re-arm the idle worker in a busy loop. Safe because the caller + // (WSLCContainerImpl, via its m_comWrapper reference) guarantees the count cannot reach zero + // and destroy the object here. + AddRef(); + return RuntimeClassBase::Release() > 1; +} + HRESULT WSLCContainer::Attach(LPCSTR DetachKeys, WSLCHandle* Stdin, WSLCHandle* Stdout, WSLCHandle* Stderr) { WSLCExecutionContext context(&m_session); diff --git a/src/windows/wslcsession/WSLCContainer.h b/src/windows/wslcsession/WSLCContainer.h index c3ad79288c..9e63258ad5 100644 --- a/src/windows/wslcsession/WSLCContainer.h +++ b/src/windows/wslcsession/WSLCContainer.h @@ -110,6 +110,12 @@ class WSLCContainerImpl void CopyTo(IWSLCContainer** Container) const; + // Returns true if a client still holds a reference to this container's COM wrapper (i.e. the + // wrapper's reference count exceeds the single reference owned internally by the impl). Used by + // the idle worker so the VM is not torn down out from under an outstanding container proxy, + // which would otherwise leave the client with RPC_E_DISCONNECTED. + bool IsExternallyReferenced() const noexcept; + const std::string& Image() const noexcept; const std::string& Name() const noexcept; WSLCContainerState State() const noexcept; @@ -222,6 +228,9 @@ class DECLSPEC_UUID("B1F1C4E3-C225-4CAE-AD8A-34C004DE1AE4") WSLCContainer { public: + using RuntimeClassBase = + Microsoft::WRL::RuntimeClass, IWSLCContainer, IFastRundown, ISupportErrorInfo>; + WSLCContainer(WSLCContainerImpl* impl, WSLCSession& session, std::function&& OnDeleted); IFACEMETHOD(Attach)(_In_opt_ LPCSTR DetachKeys, _Out_ WSLCHandle* Stdin, _Out_ WSLCHandle* Stdout, _Out_ WSLCHandle* Stderr) override; @@ -244,6 +253,19 @@ class DECLSPEC_UUID("B1F1C4E3-C225-4CAE-AD8A-34C004DE1AE4") WSLCContainer IFACEMETHOD(InterfaceSupportsErrorInfo)(REFIID riid); + // RuntimeClass reference-count override. When a client releases its last proxy (leaving only the + // single internal reference owned by WSLCContainerImpl::m_comWrapper), wake the idle worker so + // the VM can be reclaimed. This pairs with WSLCContainerImpl::IsExternallyReferenced(), which + // keeps the VM alive while a client still holds a container proxy; without this signal the idle + // worker would never re-evaluate after the proxy was released and the VM would stay up forever. + ULONG STDMETHODCALLTYPE Release() override; + + // Returns true if a client still holds a reference to this wrapper, i.e. the reference count + // exceeds the single internal reference owned by WSLCContainerImpl::m_comWrapper. Reads the + // count via an AddRef + base Release round-trip that deliberately bypasses the Release() override + // above, so querying does not itself wake the idle worker. + bool HasExternalReference() noexcept; + // Cache read-only properties so they remain accessible after the impl is disconnected. // Called from WSLCContainerImpl::PrepareDisconnectComWrapper() while m_lock is held exclusively. void CacheState(const std::string& id, const std::string& name, WSLCContainerState state, const Microsoft::WRL::ComPtr& initProcess) noexcept; diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 7644fffd3c..36fe67bb66 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -663,10 +663,18 @@ bool WSLCSession::HasActiveContainerLockHeld() std::lock_guard containersLock(m_containersLock); // A container in the Created or Running state keeps the VM alive (it is non-terminal and - // may still be started/used). Exited containers do not. + // may still be started/used). Exited containers do not -- unless a client still holds a proxy + // to the container's COM wrapper, in which case tearing the VM down would disconnect that proxy + // (RPC_E_DISCONNECTED). Keep the VM alive while any container is non-terminal or still + // externally referenced. return std::ranges::any_of(m_containers, [](const auto& entry) { const auto state = entry.second->State(); - return state == WslcContainerStateCreated || state == WslcContainerStateRunning; + if (state == WslcContainerStateCreated || state == WslcContainerStateRunning) + { + return true; + } + + return entry.second->IsExternallyReferenced(); }); } diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index f931b40f5a..a4514529b0 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -236,6 +236,13 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession // Safe to call from any thread, including IO relay / container callbacks. void RequestIdleCheck() noexcept; + // Creates an opaque activity token that holds a reference on this session's activity count for + // its lifetime, deferring idle teardown of the VM until every outstanding token is released. + // Used both for transient client operations (BeginContainerOperation) and to keep the VM alive + // for the lifetime of a process whose wrapper a client may keep (root-namespace and exec'd + // processes). + Microsoft::WRL::ComPtr CreateActivityToken(); + private: ULONG m_id = 0; @@ -269,12 +276,6 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession bool HasActiveContainerLockHeld(); void EnsureVmRunning(); - // Creates an opaque activity token that holds a reference on this session's activity count for - // its lifetime, deferring idle teardown of the VM until every outstanding token is released. - // Used both for transient client operations (BeginContainerOperation) and to keep the VM alive - // for the lifetime of a root-namespace process. - Microsoft::WRL::ComPtr CreateActivityToken(); - void IdleWorker(); bool IdleTerminationEnabled() const noexcept; void PersistSettings(const WSLCSessionInitSettings& Settings, PSID UserSid); diff --git a/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp index b88c212d2c..2c10b02698 100644 --- a/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp +++ b/test/windows/wslc/e2e/WSLCE2EVmIdleTests.cpp @@ -148,6 +148,49 @@ class WSLCE2EVmIdleTests WaitForVmRunningState(session, false); } + // A client may hold a proxy to a container that has exited and is therefore no longer "active" + // by state. Tearing the VM down would disconnect that proxy (leaving the client with + // RPC_E_DISCONNECTED), so the idle worker must keep the VM alive while any container proxy is + // outstanding -- and reclaim it promptly once the client releases the proxy. This is the + // container analogue of the root-process keep-alive above. + WSLC_TEST_METHOD(WSLCE2E_VmIdle_HeldContainerProxyKeepsVmAlive) + { + auto session = TestSession::Create(L"wslc-vmidle-heldcontainer"); + + EnsureImageIsLoaded(AlpineImage, session.Name()); + + // Launch a container that exits almost immediately, then keep the returned proxy. Once it has + // exited it no longer counts as active by state, so only the held proxy can keep the VM up. + wsl::windows::common::WSLCContainerLauncher launcher( + wsl::shared::string::WideToMultiByte(AlpineImage.NameAndTag()), + "wslc-vmidle-heldcontainer", + {"/bin/true"}, + {}, + "none"); + + std::optional container = launcher.Launch(*session.Session(), WSLCContainerStartFlagsNone); + + // Exercise the pure proxy-release path (not container deletion) as the trigger for teardown. + container->SetDeleteOnClose(false); + + // Wait for the container to exit so it no longer keeps the VM alive by being Created/Running. + retry::RetryWithTimeout( + [&]() { THROW_HR_IF(E_FAIL, container->State() != WslcContainerStateExited); }, + std::chrono::milliseconds(250), + std::chrono::seconds(60)); + + // The VM must remain running well past the idle grace period (30s) while the exited + // container's proxy is held. Without the pin the idle worker would tear the VM down ~30s + // after the launch returned, so a generous margin past the grace period catches that + // regression reliably. + std::this_thread::sleep_for(std::chrono::seconds(40)); + VERIFY_IS_TRUE(static_cast(QueryDiagnostics(session).Running)); + + // Releasing the container proxy drops the last external reference and the VM idle-terminates. + container.reset(); + WaitForVmRunningState(session, false); + } + // arrives while teardown may still be in flight. All operations must succeed (no spurious // ERROR_INVALID_STATE from racing a VM that is stopping). WSLC_TEST_METHOD(WSLCE2E_VmIdle_ConcurrentRecreateDoesNotFail) From 4f5a23ca397711ffb9dd1e8a9a7f6a3f1cd65ecb Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Fri, 12 Jun 2026 13:11:11 -0700 Subject: [PATCH 21/22] Wake idle worker via captured idle state, not raw session ref WSLCContainer::Release() woke the idle worker by calling m_session.RequestIdleCheck(), but the wrapper can outlive the session (a client keeps a container proxy past releasing the session; the impl's baseline m_comWrapper ref is dropped during teardown while the client proxy survives) and can also be destroyed concurrently the instant our reference drops. Both make touching m_session after Release() a use-after-free. Bind an idle-check signaler at construction that captures the session's shared IdleState (shared_ptr), mirroring CreateActivityToken, and snapshot it on the stack before RuntimeClassBase::Release() so no member is touched on any post-Release path. Make WSLCSession::IdleState public for the comment reference. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/wslcsession/WSLCContainer.cpp | 20 +++++++++++++++---- src/windows/wslcsession/WSLCContainer.h | 10 ++++++++++ src/windows/wslcsession/WSLCSession.h | 24 ++++++++++++----------- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/windows/wslcsession/WSLCContainer.cpp b/src/windows/wslcsession/WSLCContainer.cpp index e9d9f54d2f..f3a2c61287 100644 --- a/src/windows/wslcsession/WSLCContainer.cpp +++ b/src/windows/wslcsession/WSLCContainer.cpp @@ -2159,19 +2159,31 @@ __requires_lock_held(m_lock) void WSLCContainerImpl::Transition(WSLCContainerSta WSLCContainer::WSLCContainer(WSLCContainerImpl* impl, WSLCSession& session, std::function&& OnDeleted) : COMImplClass(impl), m_session(session), m_onDeleted(std::move(OnDeleted)) { + // Bind the idle-check signaler to the session's shared idle state rather than to m_session, so + // Release() can wake the idle worker without dereferencing the (possibly torn-down) session. The + // captured shared_ptr keeps the idle state alive independently of the session's lifetime. + std::shared_ptr idleState = session.m_idleState; + m_requestIdleCheck = [idleState = std::move(idleState)]() { idleState->IdleCheckEvent.SetEvent(); }; } ULONG STDMETHODCALLTYPE WSLCContainer::Release() { + // Snapshot the signaler on the stack BEFORE dropping our reference. Once Release() returns, this + // object may already be gone: a concurrent owner of the last remaining reference (e.g. container + // deletion releasing WSLCContainerImpl::m_comWrapper) can destroy it, and the session itself may + // be torn down while a client still holds this proxy. The captured shared state keeps the wake + // valid in both cases, so we never touch a member after Release(). + const std::function requestIdleCheck = m_requestIdleCheck; + const ULONG count = RuntimeClassBase::Release(); // A count of 1 means only WSLCContainerImpl::m_comWrapper (the single internal reference) is // left, i.e. a client just released its last proxy. Wake the idle worker so the now-idle VM can - // be reclaimed. N.B. at count 0 the object has already been destroyed, so members (including - // m_session) must not be touched on that path. - if (count == 1) + // be reclaimed. N.B. at count 0 the object has already been destroyed; we deliberately signal + // only through the stack-local snapshot, never a member, on any post-Release path. + if (count == 1 && requestIdleCheck) { - m_session.RequestIdleCheck(); + requestIdleCheck(); } return count; diff --git a/src/windows/wslcsession/WSLCContainer.h b/src/windows/wslcsession/WSLCContainer.h index 9e63258ad5..00c5e82c9c 100644 --- a/src/windows/wslcsession/WSLCContainer.h +++ b/src/windows/wslcsession/WSLCContainer.h @@ -258,6 +258,8 @@ class DECLSPEC_UUID("B1F1C4E3-C225-4CAE-AD8A-34C004DE1AE4") WSLCContainer // the VM can be reclaimed. This pairs with WSLCContainerImpl::IsExternallyReferenced(), which // keeps the VM alive while a client still holds a container proxy; without this signal the idle // worker would never re-evaluate after the proxy was released and the VM would stay up forever. + // The wake is delivered through the captured m_idleState (not m_session) so it stays safe even + // if the wrapper outlives the session or is concurrently destroyed once our reference drops. ULONG STDMETHODCALLTYPE Release() override; // Returns true if a client still holds a reference to this wrapper, i.e. the reference count @@ -274,6 +276,14 @@ class DECLSPEC_UUID("B1F1C4E3-C225-4CAE-AD8A-34C004DE1AE4") WSLCContainer WSLCSession& m_session; std::function m_onDeleted; + // Wakes the idle worker when the last client proxy is released (see Release()). Bound at + // construction to a lambda that captures the session's shared idle state (WSLCSession::IdleState + // held via shared_ptr), so signalling never dereferences m_session: the wrapper can outlive the + // session (a client keeps this proxy past releasing the session) and can be concurrently + // destroyed the instant Release() drops our reference. The captured shared_ptr keeps the idle + // state alive and valid in both cases. + std::function m_requestIdleCheck; + // Cached read-only properties populated by CacheState() so they remain // accessible after the impl is disconnected. mutable wil::srwlock m_cacheLock; diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index a4514529b0..f8e0a2c907 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -243,6 +243,19 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession // processes). Microsoft::WRL::ComPtr CreateActivityToken(); + // Idle-activity state shared between the session and any outstanding activity tokens. Held via + // shared_ptr so a token (or a container COM wrapper) can outlive the session (e.g. a client + // keeps a root-namespace process or container proxy past releasing the session) and still + // safely release its activity reference / wake the idle worker without keeping the session + // object alive. Tearing down the session therefore proceeds normally; a late token release + // simply decrements the count and signals an event with no waiter. Public so the container COM + // wrapper (WSLCContainer) can hold a shared_ptr to it; see WSLCContainer::Release(). + struct IdleState + { + std::atomic ActivityCount{0}; + wil::unique_event IdleCheckEvent{wil::EventOptions::ManualReset}; + }; + private: ULONG m_id = 0; @@ -255,17 +268,6 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession Stopping, }; - // Idle-activity state shared between the session and any outstanding activity tokens. Held via - // shared_ptr so a token can outlive the session (e.g. a client keeps a root-namespace process - // proxy past releasing the session) and still safely release its activity reference without - // keeping the session object alive. Tearing down the session therefore proceeds normally; a - // late token release simply decrements the count and signals an event with no waiter. - struct IdleState - { - std::atomic ActivityCount{0}; - wil::unique_event IdleCheckEvent{wil::EventOptions::ManualReset}; - }; - _Requires_exclusive_lock_held_(m_lock) void StartVmLockHeld(); _Requires_exclusive_lock_held_(m_lock) From b87004431a5e5a4897948662afaebc0190cebf4a Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Fri, 12 Jun 2026 13:54:58 -0700 Subject: [PATCH 22/22] Align GetTerminationReason IDL comment with ERROR_INVALID_STATE behavior The comment implied callers could query the reason before termination and observe Unknown/empty, but the implementation returns ERROR_INVALID_STATE until the termination event is signaled. Document the actual contract. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/service/inc/wslc.idl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/windows/service/inc/wslc.idl b/src/windows/service/inc/wslc.idl index 525857e57c..9bdf612e0c 100644 --- a/src/windows/service/inc/wslc.idl +++ b/src/windows/service/inc/wslc.idl @@ -799,9 +799,9 @@ interface IWSLCSession : IUnknown // caller and remains valid (and observes the signaled state) even after the session is released. HRESULT GetTerminationEvent([out, system_handle(sh_event)] HANDLE* Event); - // Returns the cached termination reason and details. The values are only meaningful after the - // termination event has been signaled; before that the reason is - // WSLCVirtualMachineTerminationReasonUnknown and Details is an empty string. + // Returns the cached termination reason and details. Only valid once the session has terminated, + // i.e. after the event returned by GetTerminationEvent is signaled; before that it returns + // HRESULT_FROM_WIN32(ERROR_INVALID_STATE). On success the caller owns Details and must free it. HRESULT GetTerminationReason([out] WSLCVirtualMachineTerminationReason* Reason, [out] LPWSTR* Details); // Reports on-demand VM lifecycle diagnostics. Does not bring the VM up or count as activity.