From 0522882c5fc9b43970f447e038a44e5adcb88aed Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Sun, 7 Jun 2026 13:04:54 -0700 Subject: [PATCH 1/5] Back system distro overlay with a per-instance scratch vhd The WSLg system distro runs from a read-only vhd with a writable overlay on top. That overlay's read/write layer was backed by tmpfs, so everything written into it (logs, temp files, copied-up files, build output) consumed guest memory and could spill into swap. Heavy writes -- e.g. compiling the Linux kernel in the system distro -- could exhaust RAM and swap and trigger the OOM killer VM-wide. Back the overlay read/write layer with a per-instance temporary ext4 "scratch" vhd (dynamically expanding, 64 GB cap) instead, mirroring the swap vhd. Writes now land on reclaimable disk page cache rather than pinned guest memory, and a runaway write gets a clean ENOSPC instead of an OOM kill. The host creates and attaches a scratch-{InstanceId}.vhdx per instance when GUI apps are enabled, passes its LUN to the guest in LX_MINI_INIT_MESSAGE, and ejects + deletes it when the instance terminates (with the per-VM temp dir teardown as a backstop). The guest formats the device as ext4, mounts it, and bind-mounts a unique subdirectory as the overlay rw layer. If the scratch vhd cannot be created, attached, or mounted, the overlay transparently falls back to the previous tmpfs behavior so the distro still launches. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/main.cpp | 128 ++++++++++++-------- src/linux/init/util.cpp | 88 +++++++++++++- src/linux/init/util.h | 10 +- src/shared/inc/lxinitshared.h | 4 +- src/windows/service/exe/LxssUserSession.cpp | 20 +++ src/windows/service/exe/WslCoreInstance.cpp | 5 + src/windows/service/exe/WslCoreInstance.h | 2 + src/windows/service/exe/WslCoreVm.cpp | 91 +++++++++++++- src/windows/service/exe/WslCoreVm.h | 8 ++ 9 files changed, 300 insertions(+), 56 deletions(-) diff --git a/src/linux/init/main.cpp b/src/linux/init/main.cpp index 8cc79c1fd2..ae5fa62d00 100644 --- a/src/linux/init/main.cpp +++ b/src/linux/init/main.cpp @@ -121,9 +121,9 @@ std::optional g_EnableSocketLogging; int Chroot(const char* Target); -void CreateSwap(unsigned int Lun); +std::optional FormatScratchDevice(unsigned int Lun); -int CreateTempDirectory(const char* ParentPath, std::string& Path); +void CreateSwap(unsigned int Lun); int DetachScsiDisk(unsigned int Lun); @@ -170,7 +170,8 @@ void LaunchSystemDistro( const char* SharedMemoryRoot, const char* InstallPath, const char* UserProfile, - pid_t DistroInitPid); + pid_t DistroInitPid, + unsigned int ScratchLun); std::map ListDiskPartitions(const std::string& DeviceName, std::optional WaitForIndex = {}); @@ -295,87 +296,96 @@ Return Value: return Fd; } -void CreateSwap(unsigned int Lun) +std::optional FormatScratchDevice(unsigned int Lun) /*++ Routine Description: - This routine sets up a swap area on the specified SCSI device. + This routine formats the scratch vhd used to back the system distro overlay's + read/write layer. + + N.B. This must run after the scratch device LUN is attached, but before the + overlay is created so the overlay can mount the device as its rw layer. Arguments: - Lun - Supplies the LUN number of the SCSI device. + Lun - Supplies the LUN of the scratch device, or UINT_MAX if none. Return Value: - None. 
+ The scratch device path on success, std::nullopt on failure. --*/ +try { + if (Lun == UINT_MAX) + { + return std::nullopt; + } + + const std::string DevicePath = GetLunDevicePath(Lun); + WaitForBlockDevice(DevicePath.c_str()); + // - // Create the swap file asynchronously using the mkswap and swapon utilities in the system distro. + // Format the scratch device with ext4. // - // N.B. This is done because creating the swap file can take some time and - // the swap file does not need to be available immediately. + // N.B. This runs with the system distro as the root filesystem, so mkfs.ext4 + // and the scratch device node are directly reachable. + // + // N.B. The journal is omitted (the data is disposable) and the inode table is + // lazily initialized to keep formatting off the boot critical path. // - UtilCreateChildProcess("CreateSwap", [Lun]() { - std::string DevicePath = GetLunDevicePath(Lun); - - WaitForBlockDevice(DevicePath.c_str()); - - std::string CommandLine = std::format("/usr/sbin/mkswap '{}'", DevicePath); - THROW_LAST_ERROR_IF(UtilExecCommandLine(CommandLine.c_str(), nullptr) < 0); + const std::string CommandLine = std::format("/usr/sbin/mkfs.ext4 -q -m 0 -O ^has_journal -E lazy_itable_init=1 '{}'", DevicePath); + THROW_LAST_ERROR_IF(UtilExecCommandLine(CommandLine.c_str(), nullptr) < 0); - CommandLine = std::format("/usr/sbin/swapon '{}'", DevicePath); - UtilExecCommandLine(CommandLine.c_str(), nullptr); - }); + return DevicePath; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return std::nullopt; } -int CreateTempDirectory(const char* ParentPath, std::string& Path) +void CreateSwap(unsigned int Lun) /*++ Routine Description: - This routine creates a unique directory under the specified parent path. + This routine sets up a swap area on the specified SCSI device. Arguments: - ParentPath - Supplies the path of the parent directory. - - Path - Supplies a buffer to receive the path of the child directory that was - created. 
+ Lun - Supplies the LUN number of the SCSI device. Return Value: - 0 on success, -1 on failure. + None. --*/ { - if (ParentPath) - { - Path = ParentPath; - } - // - // Generate a random name for the directory. + // Create the swap file asynchronously using the mkswap and swapon utilities in the system distro. // - // N.B. mkdtemp requires a template string that ends in "XXXXXX". + // N.B. This is done because creating the swap file can take some time and + // the swap file does not need to be available immediately. // - Path += "/wslXXXXXX"; + UtilCreateChildProcess("CreateSwap", [Lun]() { + std::string DevicePath = GetLunDevicePath(Lun); - if (mkdtemp(Path.data()) == NULL) - { - LOG_ERROR("mkdtemp({}) failed {}", Path.c_str(), errno); - return -1; - } + WaitForBlockDevice(DevicePath.c_str()); - return 0; + std::string CommandLine = std::format("/usr/sbin/mkswap '{}'", DevicePath); + THROW_LAST_ERROR_IF(UtilExecCommandLine(CommandLine.c_str(), nullptr) < 0); + + CommandLine = std::format("/usr/sbin/swapon '{}'", DevicePath); + UtilExecCommandLine(CommandLine.c_str(), nullptr); + }); } dev_t GetBlockDeviceNumber(const std::string& BlockDeviceName) @@ -1477,7 +1487,7 @@ try size_t TargetPathLength = strlen(Target); auto AddTemporaryMount = [&](const char* Name, const char* Source, unsigned long MountFlags) { std::string Path; - THROW_LAST_ERROR_IF(CreateTempDirectory(Target, Path) < 0); + THROW_LAST_ERROR_IF(UtilCreateTempDirectory(Target, Path) < 0); THROW_LAST_ERROR_IF(mount(Source, Path.c_str(), nullptr, MountFlags, nullptr) < 0); AddEnvironmentVariable(Name, Path.substr(TargetPathLength).data()); }; @@ -1665,7 +1675,8 @@ void LaunchSystemDistro( const char* SharedMemoryRoot, const char* InstallPath, const char* UserProfile, - pid_t DistroInitPid) + pid_t DistroInitPid, + unsigned int ScratchLun) /*++ @@ -1702,6 +1713,10 @@ Routine Description: DistroInitPid - Supplies the pid of the user distribution's init process. 
+ ScratchLun - Supplies the SCSI LUN of the scratch device used to back the + overlay read/write layer, or UINT_MAX if none. When unavailable, the + overlay falls back to a tmpfs read/write layer. + Return Value: None. This method does not return. @@ -1711,10 +1726,26 @@ Return Value: try { // - // Create a writable layer on top of the read-only vhd. + // Format the scratch device (backed by a vhd) used to back the overlay + // read/write layer so it is disk-backed instead of consuming guest memory. + // On failure, the overlay transparently falls back to tmpfs. // - THROW_LAST_ERROR_IF(UtilMountOverlayFs(Target, SYSTEM_DISTRO_VHD_PATH) < 0); + const std::optional ScratchDevice = FormatScratchDevice(ScratchLun); + + // + // Create a writable layer on top of the read-only vhd. If the scratch-backed + // overlay fails to mount (e.g. the backing vhd is full or the rw layer setup + // fails), fall back to a tmpfs-backed overlay so the system distro can still + // launch. + // + + if (UtilMountOverlayFs(Target, SYSTEM_DISTRO_VHD_PATH, 0, {}, ScratchDevice) < 0) + { + THROW_LAST_ERROR_IF(!ScratchDevice.has_value()); + LOG_ERROR("scratch-backed overlay mount failed, falling back to tmpfs"); + THROW_LAST_ERROR_IF(UtilMountOverlayFs(Target, SYSTEM_DISTRO_VHD_PATH, 0, {}, std::nullopt) < 0); + } // // Launch the init daemon, this method does not return. @@ -1852,7 +1883,7 @@ try std::string MountPoint; if (Flags & LxMiniInitMessageFlagCreateOverlayFs) { - if (CreateTempDirectory(Target, MountPoint) < 0) + if (UtilCreateTempDirectory(Target, MountPoint) < 0) { return -1; } @@ -1940,7 +1971,7 @@ int MountSystemDistro(LX_MINI_INIT_MOUNT_DEVICE_TYPE DeviceType, unsigned int De Routine Description: This routine mounts the system distro as read-only, creates a writable - tmpfs layer using overlayfs, and chroots to the mount point. + overlayfs layer, and chroots to the mount point. 
Arguments: @@ -2301,7 +2332,8 @@ void ProcessLaunchInitMessage( wsl::shared::string::FromSpan(Buffer, Message->SharedMemoryRootOffset), wsl::shared::string::FromSpan(Buffer, Message->InstallPathOffset), wsl::shared::string::FromSpan(Buffer, Message->UserProfileOffset), - ChildPid); + ChildPid, + Message->ScratchLun); } } diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 962e426a27..fa9d1ef63c 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -30,6 +30,7 @@ Module Name: #include #include #include +#include #include #include "common.h" #include "wslpath.h" @@ -1838,7 +1839,56 @@ Return Value: return 0; } -int UtilMountOverlayFs(const char* Target, const char* Lower, unsigned long MountFlags, std::optional TimeoutSeconds) +int UtilCreateTempDirectory(const char* ParentPath, std::string& Path) + +/*++ + +Routine Description: + + This routine creates a unique directory under the specified parent path. + +Arguments: + + ParentPath - Supplies the path of the parent directory. + + Path - Supplies a buffer to receive the path of the child directory that was + created. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + if (ParentPath) + { + Path = ParentPath; + } + + // + // Generate a random name for the directory. + // + // N.B. mkdtemp requires a template string that ends in "XXXXXX". + // + + Path += "/wslXXXXXX"; + + if (mkdtemp(Path.data()) == NULL) + { + LOG_ERROR("mkdtemp({}) failed {}", Path.c_str(), errno); + return -1; + } + + return 0; +} + +int UtilMountOverlayFs( + const char* Target, + const char* Lower, + unsigned long MountFlags, + std::optional TimeoutSeconds, + const std::optional& ScratchDevice) /*++ @@ -1857,6 +1907,9 @@ Routine Description: TimeoutSeconds - Supplies an optional timeout if the mount should be retried. + ScratchDevice - Supplies an optional ext4 scratch block device used to back the + read/write layer. When empty, the read/write layer is backed by a tmpfs. 
+ Return Value: 0 on success, < 0 on failure. @@ -1869,7 +1922,7 @@ try // Set up the state required for overlayfs mount: // // - mount point for read/write overlayfs (this happens last) - // /rw - tmpfs mount for upper and work dirs + // /rw - read/write layer (scratch device or tmpfs) for upper and work dirs // /rw/upper - upper dir // /rw/work - work dir // @@ -1882,14 +1935,38 @@ try auto Path = std::format("{}/rw", Target); // - // Create a tmpfs mount for the read/write layer + // Mount the read/write layer at /rw, backed either by the scratch + // device (disk-backed, reclaimable) or, when no scratch device is available, + // by a tmpfs (pinned guest memory). It is unwound on a later failure so the + // caller is not left with a half-constructed overlay. // - if (UtilMount(nullptr, Path.c_str(), "tmpfs", 0, nullptr) < 0) + const std::string rwPath = Path; + bool rwMounted = false; + auto cleanupRwLayer = wil::scope_exit([&]() { + if (rwMounted) + { + umount2(rwPath.c_str(), MNT_DETACH); + } + }); + + if (ScratchDevice.has_value()) { - return -1; + if (UtilMount(ScratchDevice->c_str(), Path.c_str(), "ext4", MS_NOATIME, nullptr) < 0) + { + return -1; + } + } + else + { + if (UtilMount(nullptr, Path.c_str(), "tmpfs", 0, nullptr) < 0) + { + return -1; + } } + rwMounted = true; + // // Create upper and work directories. // @@ -1913,6 +1990,7 @@ try return -1; } + cleanupRwLayer.release(); return 0; } CATCH_RETURN_ERRNO() diff --git a/src/linux/init/util.h b/src/linux/init/util.h index 8c819ecf33..9260851da1 100644 --- a/src/linux/init/util.h +++ b/src/linux/init/util.h @@ -251,11 +251,19 @@ int UtilMkdir(const char* Path, mode_t Mode); int UtilMkdirPath(const char* Path, mode_t Mode, bool SkipLast = false); +// Creates a uniquely-named directory under ParentPath, returning its path in Path. 
+int UtilCreateTempDirectory(const char* ParentPath, std::string& Path); + int UtilMountFile(const char* Source, const char* Destination); int UtilMount(const char* Source, const char* Target, const char* Type, unsigned long MountFlags, const char* Options, std::optional TimeoutSeconds = {}); -int UtilMountOverlayFs(const char* Target, const char* Lower, unsigned long MountFlags = 0, std::optional TimeoutSeconds = {}); +int UtilMountOverlayFs( + const char* Target, + const char* Lower, + unsigned long MountFlags = 0, + std::optional TimeoutSeconds = {}, + const std::optional& ScratchDevice = {}); int UtilOpenMountNamespace(void); diff --git a/src/shared/inc/lxinitshared.h b/src/shared/inc/lxinitshared.h index 9c9e3fde9f..61d5ce1e5e 100644 --- a/src/shared/inc/lxinitshared.h +++ b/src/shared/inc/lxinitshared.h @@ -1228,6 +1228,7 @@ typedef struct _LX_MINI_INIT_MESSAGE unsigned int UserProfileOffset; unsigned int Flags; unsigned int ConnectPort; + unsigned int ScratchLun; char Buffer[]; PRETTY_PRINT( @@ -1241,7 +1242,8 @@ typedef struct _LX_MINI_INIT_MESSAGE STRING_FIELD(InstallPathOffset), STRING_FIELD(UserProfileOffset), FIELD(Flags), - FIELD(ConnectPort)); + FIELD(ConnectPort), + FIELD(ScratchLun)); } LX_MINI_INIT_MESSAGE, *PLX_MINI_INIT_MESSAGE; diff --git a/src/windows/service/exe/LxssUserSession.cpp b/src/windows/service/exe/LxssUserSession.cpp index ba22e10e03..81d9b1f072 100644 --- a/src/windows/service/exe/LxssUserSession.cpp +++ b/src/windows/service/exe/LxssUserSession.cpp @@ -2591,6 +2591,16 @@ std::shared_ptr LxssUserSessionImpl::_CreateInstance(_In_op instanceId, configuration, LxMiniInitMessageLaunchInit, m_utilityVm->GetConfig().KernelBootTimeout, defaultUid, clientKey); } + // If instance startup fails after this point, ensure the per-instance overlay + // scratch vhd created in CreateInstance is ejected and deleted (the normal + // terminate path is not reached for an instance that never finishes starting). 
+ auto scratchCleanupOnFailure = wil::scope_exit([&]() { + if (m_utilityVm) + { + m_utilityVm->CleanupInstanceScratch(instanceId); + } + }); + // Log telemetry to determine how long initialization takes. WSL_LOG( "InitializeInstanceBegin", @@ -2646,6 +2656,8 @@ std::shared_ptr LxssUserSessionImpl::_CreateInstance(_In_op cleanupOnFailure.release(); } + scratchCleanupOnFailure.release(); + result = S_OK; } catch (...) @@ -3603,9 +3615,11 @@ bool LxssUserSessionImpl::_TerminateInstanceInternal(_In_ LPCGUID DistroGuid, _I success = (success || force); if (success) { + std::optional scratchInstanceId; if (const auto* wslcoreInstance = dynamic_cast(instance->second.get()); wslcoreInstance != nullptr) { m_pluginManager.OnDistributionStopping(&m_session, wslcoreInstance->DistributionInformation()); + scratchInstanceId = wslcoreInstance->GetInstanceId(); } instance->second->Stop(); @@ -3620,6 +3634,12 @@ bool LxssUserSessionImpl::_TerminateInstanceInternal(_In_ LPCGUID DistroGuid, _I m_runningInstances.erase(instance); + // Eject and delete the per-instance overlay scratch vhd, if one was created. + if (scratchInstanceId.has_value() && m_utilityVm) + { + m_utilityVm->CleanupInstanceScratch(scratchInstanceId.value()); + } + // If the instance that was terminated was a WSL2 instance, // check if the VM is now idle. 
if (clientId != LXSS_CLIENT_ID_INVALID) diff --git a/src/windows/service/exe/WslCoreInstance.cpp b/src/windows/service/exe/WslCoreInstance.cpp index ba5c0a8235..a3261e5f35 100644 --- a/src/windows/service/exe/WslCoreInstance.cpp +++ b/src/windows/service/exe/WslCoreInstance.cpp @@ -329,6 +329,11 @@ GUID WslCoreInstance::GetDistributionId() const return m_configuration.DistroId; } +GUID WslCoreInstance::GetInstanceId() const +{ + return m_instanceId; +} + std::shared_ptr WslCoreInstance::GetInitPort() { THROW_HR_IF(HCS_E_TERMINATED, !m_initChannel); diff --git a/src/windows/service/exe/WslCoreInstance.h b/src/windows/service/exe/WslCoreInstance.h index bf2adc2cca..370a7bb6a9 100644 --- a/src/windows/service/exe/WslCoreInstance.h +++ b/src/windows/service/exe/WslCoreInstance.h @@ -92,6 +92,8 @@ class WslCoreInstance : public LxssRunningInstance GUID GetDistributionId() const override; + GUID GetInstanceId() const; + std::shared_ptr GetInitPort() override; std::shared_ptr GetSystemDistro(); diff --git a/src/windows/service/exe/WslCoreVm.cpp b/src/windows/service/exe/WslCoreVm.cpp index b658ce9abe..7721f74c34 100644 --- a/src/windows/service/exe/WslCoreVm.cpp +++ b/src/windows/service/exe/WslCoreVm.cpp @@ -45,6 +45,10 @@ using namespace std::string_literals; static constexpr size_t c_bootEntropy = 0x1000; static constexpr auto c_localDevicesKey = L"SOFTWARE\\Microsoft\\Terminal Server Client\\LocalDevices"; +// Virtual size of the dynamically-expanding scratch vhd that backs overlay +// read/write layers. The vhd grows on demand, so this is only an upper bound. 
+static constexpr ULONGLONG c_scratchVhdSizeBytes = 64ull * _1GB; + #define LXSS_ENABLE_GUI_APPS() (m_vmConfig.EnableGuiApps && (m_systemDistroDeviceId != ULONG_MAX)) using namespace wsl::windows::common; @@ -1195,12 +1199,55 @@ std::shared_ptr WslCoreVm::CreateInstance( #endif std::wstring userProfile{}; + ULONG scratchLun = ULONG_MAX; + auto scratchCleanup = wil::scope_exit([&]() { CleanupInstanceScratchLockHeld(InstanceId); }); if (LXSS_ENABLE_GUI_APPS() && (MessageType == LxMiniInitMessageLaunchInit)) { WI_SetFlag(flags, LxMiniInitMessageFlagLaunchSystemDistro); sharedMemoryRoot = m_sharedMemoryRoot; userProfile = m_userProfile; + + // Create and attach a per-instance scratch vhd to back the system distro overlay's + // read/write layer, so heavy writes land on reclaimable disk instead of guest memory. + // + // N.B. This can fail if the target directory is compressed, encrypted, or if the user + // does not have write access. On failure the guest falls back to a tmpfs overlay. + try + { + auto scratchPath = m_tempPath / std::format(L"scratch-{}.vhdx", wsl::shared::string::GuidToString(InstanceId)); + + // Eject and delete the scratch vhd if creation/attach succeeds but it is never + // successfully tracked (otherwise CleanupInstanceScratch / the temp dir teardown + // owns it). Released only once the vhd is recorded in m_instanceScratchVhds. + auto cleanupScratch = wil::scope_exit([&]() { + // Avoid advertising a lun that was torn down or never tracked. 
+ scratchLun = ULONG_MAX; + + try + { + EjectVhdLockHeld(scratchPath.c_str()); + } + CATCH_LOG() + + try + { + const auto runAsUser = wil::impersonate_token(m_userToken.get()); + LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(scratchPath.c_str())); + } + CATCH_LOG() + }); + + { + const auto runAsUser = wil::impersonate_token(m_userToken.get()); + wsl::core::filesystem::CreateVhd(scratchPath.c_str(), c_scratchVhdSizeBytes, &m_userSid.Sid, false, false); + } + + scratchLun = AttachDiskLockHeld(scratchPath.c_str(), DiskType::VHD, MountFlags::None, {}, false, m_userToken.get()); + m_instanceScratchVhds.emplace(InstanceId, scratchPath); + cleanupScratch.release(); + } + CATCH_LOG() } WI_SetFlagIf(flags, LxMiniInitMessageFlagExportCompressGzip, WI_IsFlagSet(ExportFlags, LXSS_EXPORT_DISTRO_FLAGS_GZIP)); @@ -1218,11 +1265,15 @@ std::shared_ptr WslCoreVm::CreateInstance( message.WriteString(message->SharedMemoryRootOffset, sharedMemoryRoot); message.WriteString(message->InstallPathOffset, installPath); message.WriteString(message->UserProfileOffset, userProfile); + message->ScratchLun = scratchLun; auto transaction = m_miniInitChannel.StartTransaction(); transaction.Send(message.Span()); - return CreateInstanceInternal( + auto instance = CreateInstanceInternal( InstanceId, Configuration, ReceiveTimeout, DefaultUid, ClientLifetimeId, WI_IsFlagSet(flags, LxMiniInitMessageFlagLaunchSystemDistro), ConnectPort); + + scratchCleanup.release(); + return instance; } std::shared_ptr WslCoreVm::CreateInstanceInternal( @@ -1388,6 +1439,44 @@ void WslCoreVm::EjectVhdLockHeld(_In_ PCWSTR VhdPath) } } +void WslCoreVm::CleanupInstanceScratch(_In_ const GUID& InstanceId) +{ + auto lock = m_lock.lock_exclusive(); + CleanupInstanceScratchLockHeld(InstanceId); +} + +_Requires_lock_held_(m_lock) +void WslCoreVm::CleanupInstanceScratchLockHeld(_In_ const GUID& InstanceId) +{ + const auto search = m_instanceScratchVhds.find(InstanceId); + if (search == m_instanceScratchVhds.end()) + { + return; + } + + 
const auto scratchPath = std::move(search->second); + m_instanceScratchVhds.erase(search); + + // Eject and delete the scratch vhd. + // + // N.B. The scratch device is only ever mounted inside the per-instance overlay mount + // namespace, which is destroyed when the instance terminates, so by this point no + // mount pins the device. Both steps are best-effort: any leftover is reclaimed when + // the per-VM temp directory is deleted on VM teardown. + try + { + EjectVhdLockHeld(scratchPath.c_str()); + } + CATCH_LOG() + + try + { + const auto runAsUser = wil::impersonate_token(m_userToken.get()); + LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(scratchPath.c_str())); + } + CATCH_LOG() +} + _Requires_lock_held_(m_guestDeviceLock) std::optional WslCoreVm::FindVirtioFsShare(_In_ PCWSTR tag, _In_ std::optional Admin) const { diff --git a/src/windows/service/exe/WslCoreVm.h b/src/windows/service/exe/WslCoreVm.h index b3bb810b6f..6f4b9b7beb 100644 --- a/src/windows/service/exe/WslCoreVm.h +++ b/src/windows/service/exe/WslCoreVm.h @@ -91,6 +91,10 @@ class WslCoreVm void EjectVhd(_In_ PCWSTR VhdPath); + // Ejects and deletes the per-instance overlay scratch vhd (if any) that was + // created for the instance with the specified id. 
+ void CleanupInstanceScratch(_In_ const GUID& InstanceId); + const wsl::core::Config& GetConfig() const noexcept; GUID GetRuntimeId() const; @@ -202,6 +206,9 @@ class WslCoreVm _Requires_lock_held_(m_lock) void EjectVhdLockHeld(_In_ PCWSTR VhdPath); + _Requires_lock_held_(m_lock) + void CleanupInstanceScratchLockHeld(_In_ const GUID& InstanceId); + _Requires_lock_held_(m_guestDeviceLock) std::optional FindVirtioFsShare(_In_ PCWSTR tag, _In_ std::optional Admin = {}) const; @@ -301,6 +308,7 @@ class WslCoreVm std::shared_ptr m_systemDistro; _Guarded_by_(m_lock) std::bitset m_lunBitmap; _Guarded_by_(m_lock) std::map m_attachedDisks; + _Guarded_by_(m_lock) std::map m_instanceScratchVhds; std::tuple m_kernelVersion; std::wstring m_kernelVersionString; bool m_seccompAvailable; From 60653871e73082cab66b9d331eee7af92b0dea85 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Mon, 8 Jun 2026 09:29:44 -0700 Subject: [PATCH 2/5] Fix BrokenDistroImport test for per-instance scratch vhd The test hard-coded mkfs.ext4 /dev/sde for its bare-mounted 20MB disk. The per-instance system distro overlay scratch vhd now occupies an earlier /dev/sd* node, shifting the bare disk and causing the test to format the wrong device. Detect the disk by size instead, and promote MountTests' GetBlockDeviceInWsl helper to a shared Common.h/Common.cpp function. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/windows/Common.cpp | 43 +++++++++++++++++++++++++++++++++++++ test/windows/Common.h | 5 +++++ test/windows/MountTests.cpp | 43 ------------------------------------- test/windows/UnitTests.cpp | 7 +++++- 4 files changed, 54 insertions(+), 44 deletions(-) diff --git a/test/windows/Common.cpp b/test/windows/Common.cpp index f5aa3b1a0e..07a1fcd049 100644 --- a/test/windows/Common.cpp +++ b/test/windows/Common.cpp @@ -2474,6 +2474,49 @@ void Trim(std::wstring& string) std::erase_if(string, [](auto c) { return !isalnum(c); }); } +std::wstring GetBlockDeviceInWsl() +{ + // Wait for the disk to be attached + const auto timeout = std::chrono::steady_clock::now() + std::chrono::seconds(30); + + bool done = false; + while (true) + { + for (wchar_t name = 'a'; name < 'z'; name++) + { + std::wstring cmd = L"-u root blockdev --getsize64 /dev/sd"; + cmd += name; + + std::wstring out; + try + { + out = LxsstuLaunchWslAndCaptureOutput(cmd.data()).first; + } + CATCH_LOG() + + Trim(out); + + // Disk size is 20MB, so 20 * 1024 * 1024 bytes + if (out == L"20971520") + { + return std::wstring(L"/dev/sd") + name; + } + } + + if (done) + { + break; + } + + done = std::chrono::steady_clock::now() > timeout; + } + + VERIFY_FAIL(L"Failed to find the block device in WSL"); + + // Unreachable. + return {}; +} + ScopedEnvVariable::ScopedEnvVariable(const std::wstring& Name, const std::wstring& Value) : m_name(Name) { VERIFY_IS_TRUE(SetEnvironmentVariable(Name.c_str(), Value.c_str())); diff --git a/test/windows/Common.h b/test/windows/Common.h index ad2706ebf2..83234122d8 100644 --- a/test/windows/Common.h +++ b/test/windows/Common.h @@ -577,6 +577,11 @@ void TerminateDistribution(LPCWSTR DistributionName = LXSS_DISTRO_NAME_TEST_L); void Trim(std::wstring& string); +// Returns the /dev/sd* node of the 20MB test block device attached to the WSL VM, waiting up +// to 30s for it to appear. 
The device letter is not stable across runs (it shifts with the +// number of VHDs attached to the VM), so callers must look it up rather than hard-coding it. +std::wstring GetBlockDeviceInWsl(); + inline auto EnableSystemd(const std::string& extraConfig = "") { // enable systemd on the test distro by editing /etc/wsl.conf diff --git a/test/windows/MountTests.cpp b/test/windows/MountTests.cpp index 2a75ac0718..22c8c92927 100644 --- a/test/windows/MountTests.cpp +++ b/test/windows/MountTests.cpp @@ -943,49 +943,6 @@ class MountTests VERIFY_ARE_EQUAL(!offline, wsl::windows::common::disk::IsDiskOnline(disk.get())); } - static std::wstring GetBlockDeviceInWsl() - { - // Wait for the disk to be attached - const auto timeout = std::chrono::steady_clock::now() + std::chrono::seconds(30); - - bool done = false; - while (true) - { - for (wchar_t name = 'a'; name < 'z'; name++) - { - std::wstring cmd = L"-u root blockdev --getsize64 /dev/sd"; - cmd += name; - - std::wstring out; - try - { - out = LxsstuLaunchWslAndCaptureOutput(cmd.data()).first; - } - CATCH_LOG() - - Trim(out); - - // Disk size is 20MB, so 20 * 1024 * 1024 bytes - if (out == L"20971520") - { - return std::wstring(L"/dev/sd") + name; - } - } - - if (done) - { - break; - } - - done = std::chrono::steady_clock::now() > timeout; - } - - VERIFY_FAIL(L"Failed to find the block device in WSL"); - - // Unreachable. 
- return {}; - } - static bool IsBlockDevicePresent(const std::wstring& Device) { const auto Cmd = L"test -e " + Device; diff --git a/test/windows/UnitTests.cpp b/test/windows/UnitTests.cpp index c8aa0791d0..348b75fece 100644 --- a/test/windows/UnitTests.cpp +++ b/test/windows/UnitTests.cpp @@ -6596,7 +6596,12 @@ Error code: Wsl/InstallDistro/WSL_E_INVALID_JSON\r\n", LxsstuLaunchPowershellAndCaptureOutput(std::format(L"New-Vhd {} -SizeBytes 20MB", testVhd)); VERIFY_ARE_EQUAL(LxsstuLaunchWsl(std::format(L"--mount {} --vhd --bare", testVhd)), 0L); - VERIFY_ARE_EQUAL(LxsstuLaunchWsl(L"mkfs.ext4 /dev/sde"), 0L); + + // Locate the bare-mounted disk by its size rather than hard-coding a device node. + // The /dev/sd* letter depends on how many other VHDs are attached to the VM (e.g. + // the per-instance system distro overlay scratch vhd), which shifts the bare disk. + const auto bareDevice = GetBlockDeviceInWsl(); + VERIFY_ARE_EQUAL(LxsstuLaunchWsl((L"-u root mkfs.ext4 " + bareDevice).c_str()), 0L); VERIFY_ARE_EQUAL(LxsstuLaunchWsl(L"--unmount"), 0L); auto [out, err] = LxsstuLaunchWslAndCaptureOutput(std::format(L"--import-in-place broken-test-distro {}", testVhd), -1); From a1dfcb0bf351598fa567de5680357e7f698e4162 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Mon, 8 Jun 2026 14:23:17 -0700 Subject: [PATCH 3/5] Address PR review: drop scratch vhd tracking map, fix device scan bound The per-instance scratch vhd path is deterministic from the instance id, so remove the m_instanceScratchVhds map and derive the path via GetInstanceScratchPath wherever it is needed (create, failure cleanup, terminate). This makes cleanup idempotent and removes the untracked-leak and const-std::move review findings. Also fix the block-device scan in the test helper to include /dev/sdz. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/windows/service/exe/WslCoreVm.cpp | 72 ++++++++++++--------------- src/windows/service/exe/WslCoreVm.h | 6 ++- test/windows/Common.cpp | 2 +- 3 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/windows/service/exe/WslCoreVm.cpp b/src/windows/service/exe/WslCoreVm.cpp index 7721f74c34..50bba0fb9d 100644 --- a/src/windows/service/exe/WslCoreVm.cpp +++ b/src/windows/service/exe/WslCoreVm.cpp @@ -1211,32 +1211,14 @@ std::shared_ptr WslCoreVm::CreateInstance( // Create and attach a per-instance scratch vhd to back the system distro overlay's // read/write layer, so heavy writes land on reclaimable disk instead of guest memory. // - // N.B. This can fail if the target directory is compressed, encrypted, or if the user - // does not have write access. On failure the guest falls back to a tmpfs overlay. + // N.B. The vhd path is derived from the instance id (GetInstanceScratchPath) and is not + // tracked: cleanup on failure here, on a later startup failure (scratchCleanup), and + // on terminate all recompute it. Creation can fail if the target directory is + // compressed, encrypted, or not writable by the user; on any failure the scratch is + // torn down and the guest falls back to a tmpfs overlay. try { - auto scratchPath = m_tempPath / std::format(L"scratch-{}.vhdx", wsl::shared::string::GuidToString(InstanceId)); - - // Eject and delete the scratch vhd if creation/attach succeeds but it is never - // successfully tracked (otherwise CleanupInstanceScratch / the temp dir teardown - // owns it). Released only once the vhd is recorded in m_instanceScratchVhds. - auto cleanupScratch = wil::scope_exit([&]() { - // Avoid advertising a lun that was torn down or never tracked. 
- scratchLun = ULONG_MAX; - - try - { - EjectVhdLockHeld(scratchPath.c_str()); - } - CATCH_LOG() - - try - { - const auto runAsUser = wil::impersonate_token(m_userToken.get()); - LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(scratchPath.c_str())); - } - CATCH_LOG() - }); + const auto scratchPath = GetInstanceScratchPath(InstanceId); { const auto runAsUser = wil::impersonate_token(m_userToken.get()); @@ -1244,10 +1226,16 @@ std::shared_ptr WslCoreVm::CreateInstance( } scratchLun = AttachDiskLockHeld(scratchPath.c_str(), DiskType::VHD, MountFlags::None, {}, false, m_userToken.get()); - m_instanceScratchVhds.emplace(InstanceId, scratchPath); - cleanupScratch.release(); } - CATCH_LOG() + catch (...) + { + LOG_CAUGHT_EXCEPTION(); + + // Tear down any partially-created scratch and avoid advertising a torn-down lun; the + // guest falls back to a tmpfs overlay. + scratchLun = ULONG_MAX; + CleanupInstanceScratchLockHeld(InstanceId); + } } WI_SetFlagIf(flags, LxMiniInitMessageFlagExportCompressGzip, WI_IsFlagSet(ExportFlags, LXSS_EXPORT_DISTRO_FLAGS_GZIP)); @@ -1445,24 +1433,25 @@ void WslCoreVm::CleanupInstanceScratch(_In_ const GUID& InstanceId) CleanupInstanceScratchLockHeld(InstanceId); } +std::filesystem::path WslCoreVm::GetInstanceScratchPath(_In_ const GUID& InstanceId) const +{ + return m_tempPath / std::format(L"scratch-{}.vhdx", wsl::shared::string::GuidToString(InstanceId)); +} + _Requires_lock_held_(m_lock) void WslCoreVm::CleanupInstanceScratchLockHeld(_In_ const GUID& InstanceId) { - const auto search = m_instanceScratchVhds.find(InstanceId); - if (search == m_instanceScratchVhds.end()) - { - return; - } - - const auto scratchPath = std::move(search->second); - m_instanceScratchVhds.erase(search); - - // Eject and delete the scratch vhd. + // Eject and delete the per-instance overlay scratch vhd. 
The path is derived from the + // instance id, so this is idempotent and safe to call on any failure path and on terminate: + // ejecting a device that was never attached is a no-op, and deleting a file that was never + // created is ignored. // // N.B. The scratch device is only ever mounted inside the per-instance overlay mount // namespace, which is destroyed when the instance terminates, so by this point no - // mount pins the device. Both steps are best-effort: any leftover is reclaimed when - // the per-VM temp directory is deleted on VM teardown. + // mount pins the device. Any leftover is also reclaimed when the per-VM temp directory + // is deleted on VM teardown. + const auto scratchPath = GetInstanceScratchPath(InstanceId); + try { EjectVhdLockHeld(scratchPath.c_str()); @@ -1472,7 +1461,10 @@ void WslCoreVm::CleanupInstanceScratchLockHeld(_In_ const GUID& InstanceId) try { const auto runAsUser = wil::impersonate_token(m_userToken.get()); - LOG_IF_WIN32_BOOL_FALSE(DeleteFileW(scratchPath.c_str())); + if (!DeleteFileW(scratchPath.c_str()) && (GetLastError() != ERROR_FILE_NOT_FOUND)) + { + LOG_LAST_ERROR(); + } } CATCH_LOG() } diff --git a/src/windows/service/exe/WslCoreVm.h b/src/windows/service/exe/WslCoreVm.h index 6f4b9b7beb..1e2cba2854 100644 --- a/src/windows/service/exe/WslCoreVm.h +++ b/src/windows/service/exe/WslCoreVm.h @@ -209,6 +209,11 @@ class WslCoreVm _Requires_lock_held_(m_lock) void CleanupInstanceScratchLockHeld(_In_ const GUID& InstanceId); + // Returns the per-instance overlay scratch vhd path, derived from the instance id. The path + // is deterministic so the scratch vhd does not need to be tracked: every cleanup path + // recomputes it. 
+ std::filesystem::path GetInstanceScratchPath(_In_ const GUID& InstanceId) const; + _Requires_lock_held_(m_guestDeviceLock) std::optional FindVirtioFsShare(_In_ PCWSTR tag, _In_ std::optional Admin = {}) const; @@ -308,7 +313,6 @@ class WslCoreVm std::shared_ptr m_systemDistro; _Guarded_by_(m_lock) std::bitset m_lunBitmap; _Guarded_by_(m_lock) std::map m_attachedDisks; - _Guarded_by_(m_lock) std::map m_instanceScratchVhds; std::tuple m_kernelVersion; std::wstring m_kernelVersionString; bool m_seccompAvailable; diff --git a/test/windows/Common.cpp b/test/windows/Common.cpp index 07a1fcd049..852b79defb 100644 --- a/test/windows/Common.cpp +++ b/test/windows/Common.cpp @@ -2482,7 +2482,7 @@ std::wstring GetBlockDeviceInWsl() bool done = false; while (true) { - for (wchar_t name = 'a'; name < 'z'; name++) + for (wchar_t name = 'a'; name <= 'z'; name++) { std::wstring cmd = L"-u root blockdev --getsize64 /dev/sd"; cmd += name; From 86ca0ce845ab2558177c0e1f11f19fd567e317ef Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Wed, 10 Jun 2026 10:01:09 -0700 Subject: [PATCH 4/5] Mount the scratch-backed system distro overlay volatile The per-instance scratch vhd backing the system distro overlay's read/write layer is reformatted on every launch and discarded on teardown, so the upper dir is disposable. Mount the overlay with the 'volatile' option (overlayfs >= 5.10) to skip syncs to the upper filesystem, avoiding writeback stalls on a layer whose contents never need to survive a crash. Volatile is only applied to the scratch-backed layer (a tmpfs rw layer gains nothing). If the running kernel rejects the option the mount is retried without it, so a disk-backed overlay is still used rather than falling back to tmpfs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index fa9d1ef63c..23377780ed 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -1985,6 +1985,33 @@ try } MountOptions += std::format("workdir={}", Path); + + // + // The scratch-backed read/write layer is reformatted on every launch and discarded on + // teardown, so the overlay upper dir is disposable. Mount it "volatile" to skip syncs to the + // upper filesystem (supported since overlayfs 5.10), avoiding writeback stalls on a layer + // whose contents never need to survive a crash. overlayfs refuses to reuse a volatile upper + // dir after an unclean shutdown, but that is moot here because the upper dir is always freshly + // created above. A tmpfs read/write layer gains nothing from volatile, so it is only used for + // the scratch-backed layer. + // + // N.B. If the running kernel does not support the volatile option the mount fails with EINVAL; + // retry without it so a disk-backed overlay is still used rather than falling all the way + // back to a tmpfs layer. 
+ // + + if (ScratchDevice.has_value()) + { + const auto VolatileOptions = MountOptions + ",volatile"; + if (UtilMount(nullptr, Target, "overlay", MountFlags, VolatileOptions.c_str(), TimeoutSeconds) >= 0) + { + cleanupRwLayer.release(); + return 0; + } + + LOG_ERROR("volatile overlay mount failed, retrying without volatile"); + } + if (UtilMount(nullptr, Target, "overlay", MountFlags, MountOptions.c_str(), TimeoutSeconds) < 0) { return -1; From 78cdb335d9d6956ad96ffe6f5e6277f1bfeaabe4 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Wed, 10 Jun 2026 10:46:10 -0700 Subject: [PATCH 5/5] Address PR review: gate volatile retry, throttle device scan, harden temp dir Only retry the overlay mount without the volatile option when the kernel rejects it with EINVAL; surface any other failure so the caller falls back to a tmpfs read/write layer instead of masking a real error. Throttle GetBlockDeviceInWsl's rescan loop with a short sleep so it does not spin launching wsl.exe while the scratch disk attaches. Set UtilCreateTempDirectory's path explicitly so the result does not depend on prior buffer contents when ParentPath is null. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 28 ++++++++++++++++++++++------ test/windows/Common.cpp | 7 +++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 23377780ed..3d58ac75b2 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -1861,10 +1861,12 @@ Return Value: --*/ { - if (ParentPath) - { - Path = ParentPath; - } + // + // Set the parent path explicitly so the result does not depend on any prior contents of the + // caller's buffer when ParentPath is null. + // + + Path = ParentPath != nullptr ? ParentPath : ""; // // Generate a random name for the directory. 
@@ -2003,13 +2005,27 @@ try if (ScratchDevice.has_value()) { const auto VolatileOptions = MountOptions + ",volatile"; - if (UtilMount(nullptr, Target, "overlay", MountFlags, VolatileOptions.c_str(), TimeoutSeconds) >= 0) + const int VolatileResult = UtilMount(nullptr, Target, "overlay", MountFlags, VolatileOptions.c_str(), TimeoutSeconds); + if (VolatileResult >= 0) { cleanupRwLayer.release(); return 0; } - LOG_ERROR("volatile overlay mount failed, retrying without volatile"); + // + // A kernel that does not understand the "volatile" option fails the mount with EINVAL; + // only that warrants retrying without it. Any other failure (e.g. ENOSPC, EBUSY) would + // recur, so surface it to the caller, which falls back to a tmpfs read/write layer. + // + // N.B. UtilMount returns the negated errno on failure. + // + + if (VolatileResult != -EINVAL) + { + return -1; + } + + LOG_ERROR("volatile overlay mount unsupported (errno {}), retrying without volatile", -VolatileResult); } if (UtilMount(nullptr, Target, "overlay", MountFlags, MountOptions.c_str(), TimeoutSeconds) < 0) diff --git a/test/windows/Common.cpp b/test/windows/Common.cpp index 852b79defb..ab444610de 100644 --- a/test/windows/Common.cpp +++ b/test/windows/Common.cpp @@ -21,6 +21,7 @@ Module Name: #include #include #include +#include using namespace WEX::Logging; using namespace WEX::Common; @@ -2509,6 +2510,12 @@ std::wstring GetBlockDeviceInWsl() } done = std::chrono::steady_clock::now() > timeout; + if (!done) + { + // Wait briefly before rescanning so the helper does not spin launching wsl.exe in a + // tight loop (burning CPU and spawning a burst of subprocesses) while the disk attaches. + std::this_thread::sleep_for(std::chrono::milliseconds(250)); + } } VERIFY_FAIL(L"Failed to find the block device in WSL");