From 51f1c8e8964be5f668271708498d1005d635926d Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 11:06:12 -0700 Subject: [PATCH 1/3] Rework the memory reduction thread around explicit reclaim helpers Replace the ring-buffer idle detector and user-CPU-only sampling in the mini-init memory reduction thread with a clearer, helper-based design: - Sample aggregate non-idle CPU time (user, system, irq, softirq, steal) so kernel-bound work keeps the VM out of the idle state, instead of looking at user time alone. - ReadProcFile reads a full procfs snapshot into a caller buffer (close-on-exec, partial-read safe); GetReclaimableCacheBytes / GetFreeMemoryBytes read the relevant counters through it. - Gradual mode reclaims cold page cache (cgroup memory.reclaim) above a fixed floor while CPU-idle, with a hysteresis margin so it does not churn near the floor. - DropCache mode stays gated on sustained CPU idle, drops once, and re-drops only after the reclaimable cache grows meaningfully. - Compaction is gated on free-memory growth so it runs only when there are newly-freed pages worth coalescing. RequestCgroupReclaim performs the memory.reclaim write best-effort: it treats the kernel's expected EAGAIN (some, but not all, pages evicted) as success without logging, and never throws so a transient write error cannot tear down the long-lived reduction thread. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 7813 ++++++++++++++++++++------------------- 1 file changed, 4072 insertions(+), 3741 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 962e426a2..3e2628c2f 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -1,3741 +1,4072 @@ -/*++ - -Copyright (c) Microsoft. All rights reserved. - -Module Name: - - util.c - -Abstract: - - This file utility function definitions. - ---*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.h" -#include "wslpath.h" -#include "util.h" -#include "drvfs.h" -#include "escape.h" -#include "config.h" -#include "mountutilcpp.h" -#include "message.h" -#include "RuntimeErrorWithSourceLocation.h" -#include "SocketChannel.h" -#include "Localization.h" - -#define INITIAL_MESSAGE_BUFFER_SIZE (0x1000) - -#define PLAN9_RDR_PREFIX "\\\\wsl.localhost\\" -#define PLAN9_RDR_COMPAT_PREFIX "\\\\wsl$\\" - -#define WSLENV_ENV "WSLENV" - -#define WSL_CGROUPS_FIELD_ENABLED (3) -#define WSL_CGROUPS_FIELD_MAX WSL_CGROUPS_FIELD_ENABLED -#define WSL_CGROUPS_FIELD_SEP '\t' -#define WSL_CGROUPS_FIELD_SUBSYSTEM (0) - -#define WSL_MOUNT_OPTION_SEP ',' - -int g_IsVmMode = -1; -static std::optional g_CachedFeatureFlags; -static sigset_t g_originalSignals; -thread_local std::string g_threadName; - -namespace wil { - -thread_local std::optional ScopedWarningsCollector::g_collectedWarnings; - -} - -int InteropServer::Create() - -/*++ - -Routine Description: - - This routine creates an interop server unix socket and starts listening on it. - -Arguments: - - None. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - if (!m_InteropSocketPath.empty()) - { - LOG_ERROR("Interop server already created"); - return -1; - } - - // - // Generate a unique name to be used for the interop socket path. - // - - m_InteropSocketPath = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, getpid(), WSL_INTEROP_SOCKET); - - // - // Ensure the WSL temp folder exists and has the correct mode. - // - - if (UtilMkdir(WSL_TEMP_FOLDER, WSL_TEMP_FOLDER_MODE) < 0) - { - return -1; - } - - // - // Create a unix socket to handle interop requests. - // - // N.B. This is done before the child process is created to ensure that - // the socket is ready for connections. - // - - m_InteropSocket.reset(socket(AF_UNIX, (SOCK_STREAM | SOCK_CLOEXEC), 0)); - if (!m_InteropSocket) - { - LOG_ERROR("socket failed {}", errno); - return -1; - } - - sockaddr_un InteropSocketAddress{}; - InteropSocketAddress.sun_family = AF_UNIX; - strncpy(InteropSocketAddress.sun_path, m_InteropSocketPath.c_str(), (sizeof(InteropSocketAddress.sun_path) - 1)); - - auto Result = bind(m_InteropSocket.get(), reinterpret_cast(&InteropSocketAddress), sizeof(InteropSocketAddress)); - if (Result < 0) - { - LOG_ERROR("bind failed {}", errno); - return -1; - } - - Result = listen(m_InteropSocket.get(), -1); - if (Result < 0) - { - LOG_ERROR("listen failed {}", errno); - return -1; - } - - // - // Ensure that any users can connect to the interop socket. - // - - Result = chmod(m_InteropSocketPath.c_str(), 0777); - if (Result < 0) - { - LOG_ERROR("chmod failed {}", errno); - return -1; - } - - return 0; -} - -wil::unique_fd InteropServer::Accept() const - -/*++ - -Routine Description: - - This routine accepts a connection on the interop server. - -Arguments: - - None. - -Return Value: - - The socket. - ---*/ - -{ - wil::unique_fd InteropConnection{accept4(m_InteropSocket.get(), nullptr, nullptr, SOCK_CLOEXEC)}; - if (!InteropConnection) - { - LOG_ERROR("accept4 failed {}", errno); - return {}; - } - - timeval Timeout{}; - Timeout.tv_sec = INTEROP_TIMEOUT_SEC; - if (setsockopt(InteropConnection.get(), SOL_SOCKET, SO_RCVTIMEO, &Timeout, sizeof(Timeout)) < 0) - { - LOG_ERROR("setsockopt(SO_RCVTIMEO) failed {}", errno); - } - - return InteropConnection; -} - -void InteropServer::Reset() -{ - if (!m_InteropSocketPath.empty()) - { - unlink(m_InteropSocketPath.c_str()); - m_InteropSocketPath = {}; - } -} - -InteropServer::~InteropServer() -{ - Reset(); -} - -int UtilAcceptVsock(int SocketFd, sockaddr_vm SocketAddress, int Timeout, int SocketFlags) - -/*++ - -Routine Description: - - This routine accepts a socket connection. - -Arguments: - - SocketFd - Supplies a socket file descriptor. - - SocketAddress - Supplies the socket address. This is passed by value instead - of by reference because accept4 modifies the structure to contain the - address of the peer socket. - - Timeout - Supplies a timeout. - - SocketFlags - Supplies the socket flags. - -Return Value: - - A file descriptor representing the socket, -1 on failure. - ---*/ - -{ - // - // If a timeout was specified, use a pollfd to wait for the accept. - // - - int Result = 0; - if (Timeout == -1) - { - pollfd PollDescriptor{SocketFd, POLLIN, 0}; - - while (true) - { - Result = poll(&PollDescriptor, 1, 60 * 1000); - if (Result < 0) - { - LOG_ERROR("poll({}) failed, {}", SocketFd, errno); - return Result; - } - else if ((Result == 0) || ((PollDescriptor.revents & POLLIN) == 0)) - { - LOG_ERROR("Waiting for abnormally long accept({})", SocketFd); - } - else - { - break; - } - } - } - else - { - pollfd PollDescriptor{SocketFd, POLLIN, 0}; - Result = poll(&PollDescriptor, 1, Timeout); - if ((Result <= 0) || ((PollDescriptor.revents & POLLIN) == 0)) - { - errno = ETIMEDOUT; - Result = -1; - } - } - - if (Result != -1) - { - socklen_t SocketAddressSize = sizeof(SocketAddress); - Result = accept4(SocketFd, reinterpret_cast(&SocketAddress), &SocketAddressSize, SocketFlags); - } - - if (Result < 0) - { - LOG_ERROR("accept4 failed {}", errno); - } - - return Result; -} - -int UtilBindVsockAnyPort(struct sockaddr_vm* SocketAddress, int Type) - -/*++ - -Routine Description: - - This routine creates a bound vsock socket an available port. - -Arguments: - - SocketAddress - Supplies a buffer to receive the socket address of the - socket. - - Type - Supplies the socket type. - -Return Value: - - A file descriptor representing the bound socket, -1 on failure. - ---*/ - -{ - int Result; - socklen_t SocketAddressSize; - int SocketFd; - - SocketFd = socket(AF_VSOCK, Type, 0); - if (SocketFd < 0) - { - Result = -1; - LOG_ERROR("socket failed {}", errno); - goto BindVsockAnyPortExit; - } - - memset(SocketAddress, 0, sizeof(*SocketAddress)); - SocketAddress->svm_family = AF_VSOCK; - SocketAddress->svm_cid = VMADDR_CID_ANY; - SocketAddress->svm_port = VMADDR_PORT_ANY; - SocketAddressSize = sizeof(*SocketAddress); - Result = bind(SocketFd, (const struct sockaddr*)SocketAddress, SocketAddressSize); - - if (Result < 0) - { - LOG_ERROR("bind failed {}", errno); - goto BindVsockAnyPortExit; - } - - // - // Query the socket name to get the assigned port. - // - - Result = getsockname(SocketFd, (struct sockaddr*)SocketAddress, &SocketAddressSize); - - if (Result < 0) - { - LOG_ERROR("getsockname failed {}", errno); - goto BindVsockAnyPortExit; - } - - Result = SocketFd; - SocketFd = -1; - -BindVsockAnyPortExit: - if (SocketFd != -1) - { - CLOSE(SocketFd); - } - - return Result; -} - -size_t UtilCanonicalisePathSeparator(char* Path, char Separator) - -/*++ - -Routine Description: - - This routine ensures all separators in Path use the specified separator. - -Arguments: - - Path - Supplies the path to canonicalise. - - Separator - Supplies the separator character to be used. - -Return Value: - - The size of the new string. - ---*/ - -{ - size_t DestIndex; - size_t PathLength; - size_t SourceIndex; - - DestIndex = 0; - SourceIndex = 0; - PathLength = strlen(Path); - - // - // Iterate through the path, replacing all separators. - // - - for (; SourceIndex < PathLength; SourceIndex++) - { - if (Path[SourceIndex] == PATH_SEP || Path[SourceIndex] == PATH_SEP_NT) - { - // - // Don't add a separator if previous char already is a separator. - // Also handle the special case where 'Path' is a UNC path (\\X or //X) - // where both separators should be kept. - // - - if (DestIndex > 1 && Path[DestIndex - 1] == Separator) - { - continue; - } - - Path[DestIndex] = Separator; - } - else - { - Path[DestIndex] = Path[SourceIndex]; - } - - DestIndex++; - } - - Path[DestIndex] = '\0'; - return DestIndex; -} - -void UtilCanonicalisePathSeparator(std::string& Path, char Separator) - -/*++ - -Routine Description: - - This routine ensures all separators in Path use the specified separator. - -Arguments: - - Path - Supplies the path to canonicalise. - - Separator - Supplies the separator character to be used. - -Return Value: - - None. - ---*/ - -{ - Path.resize(UtilCanonicalisePathSeparator(Path.data(), Separator)); -} - -wil::unique_fd UtilConnectToInteropServer(std::optional Pid) - -/*++ - -Routine Description: - - This routine connects to the interop server of the current client process. - -Arguments: - - Pid - Supplies an optional process ID to connect to. - -Return Value: - - A file descriptor representing the connected socket, -1 on failure. - ---*/ - -try -{ - char* InteropSocketPath; - std::string Path; - if (Pid.has_value()) - { - Path = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, Pid.value(), WSL_INTEROP_SOCKET); - InteropSocketPath = Path.data(); - } - else - { - // - // Query the interop server environment variable. If the process does not - // have the environment variable, or if the socket does not exists, search through parent process tree for an - // interop server. - // - - InteropSocketPath = getenv(WSL_INTEROP_ENV); - if (InteropSocketPath == nullptr || (access(InteropSocketPath, F_OK) < 0 && errno == ENOENT)) - { - pid_t Parent = getppid(); - while (Parent > 0) - { - Path = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, Parent, WSL_INTEROP_SOCKET); - if (access(Path.c_str(), F_OK) == 0) - { - InteropSocketPath = Path.data(); - break; - } - - Parent = UtilGetPpid(Parent); - } - - if (InteropSocketPath == nullptr) - { - return {}; - } - - setenv(WSL_INTEROP_ENV, InteropSocketPath, 1); - } - } - - // - // Connect to the server and return the connected socket to the caller. - // - - return UtilConnectUnix(InteropSocketPath); -} -CATCH_RETURN_ERRNO() - -wil::unique_fd UtilConnectUnix(const char* Path) - -/*++ - -Routine Description: - - This routine connects to the specified unix socket path. - -Arguments: - - Path - Supplies the path of the unix socket. - -Return Value: - - The connected socket, or a default-initialized value on failure. - ---*/ - -{ - wil::unique_fd Socket{socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)}; - if (!Socket) - { - LOG_ERROR("socket failed {}", errno); - return {}; - } - - sockaddr_un SocketAddress{}; - SocketAddress.sun_family = AF_UNIX; - strncpy(SocketAddress.sun_path, Path, sizeof(SocketAddress.sun_path) - 1); - if (connect(Socket.get(), reinterpret_cast(&SocketAddress), sizeof(SocketAddress)) < 0) - { - LOG_ERROR("connect failed {}", errno); - return {}; - } - - return Socket; -} - -wil::unique_fd UtilConnectVsock(unsigned int Port, bool CloseOnExec, std::optional SocketBuffer, const std::source_location& Source) noexcept - -/*++ - -Routine Description: - - This routine connects to a vsock with the specified port. - -Arguments: - - Port - Supplies the port to connect to. - - CloseOnExec - Supplies a boolean specifying if the socket file descriptor should be closed on exec. - - SocketBuffer - Optionally supplies the size to use for the socket send and receive buffers. - - Source - Supplies the caller location. - -Return Value: - - A file descriptor representing the connected socket, -1 on failure. - ---*/ - -{ - int Type = SOCK_STREAM; - WI_SetFlagIf(Type, SOCK_CLOEXEC, CloseOnExec); - wil::unique_fd SocketFd{socket(AF_VSOCK, Type, 0)}; - if (!SocketFd) - { - LOG_ERROR("socket failed {} (from: {})", errno, Source); - return {}; - } - - // - // Set the socket connect timeout. - // - - timeval Timeout{}; - Timeout.tv_sec = LX_INIT_HVSOCKET_TIMEOUT_SECONDS; - if (setsockopt(SocketFd.get(), AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT, &Timeout, sizeof(Timeout)) < 0) - { - LOG_ERROR("setsockopt SO_VM_SOCKETS_CONNECT_TIMEOUT failed {}, (from: {})", errno, Source); - return {}; - } - - if (SocketBuffer) - { - int BufferSize = *SocketBuffer; - if (setsockopt(SocketFd.get(), SOL_SOCKET, SO_SNDBUF, &BufferSize, sizeof(BufferSize)) < 0) - { - LOG_ERROR("setsockopt(SO_SNDBUF, {}) failed {}, (from: {})", BufferSize, errno, Source); - return {}; - } - - if (setsockopt(SocketFd.get(), SOL_SOCKET, SO_RCVBUF, &BufferSize, sizeof(BufferSize)) < 0) - { - LOG_ERROR("setsockopt(SO_RCVBUF, {}) failed {}, (from: {})", BufferSize, errno, Source); - return {}; - } - } - - sockaddr_vm SocketAddress{}; - SocketAddress.svm_family = AF_VSOCK; - SocketAddress.svm_cid = VMADDR_CID_HOST; - SocketAddress.svm_port = Port; - if (connect(SocketFd.get(), (const struct sockaddr*)&SocketAddress, sizeof(SocketAddress)) < 0) - { - LOG_ERROR("connect port {} failed {} (from: {})", Port, errno, Source); - return {}; - } - - return SocketFd; -} - -int UtilCreateProcessAndWait(const char* const File, const char* const Argv[], int* Status, const std::map& Env, bool DetachTerminal) - -/*++ - -Routine Description: - - This routine creates a helper process from init and waits for it to exit. - -Arguments: - - File - Supplies the file name to execute. - - Argv - Supplies the arguments for the command. - - Status - Supplies an optional pointer that receives the exit status of the - process. - - DetachTerminal - Supplies a boolean that, when true, calls setsid() in the - child process to detach it from the controlling terminal. - -Return Value: - - 0 on success, -1 on failure. - ---*/ -{ - pid_t ChildPid; - int Result; - int LocalStatus; - pid_t WaitResult; - - Result = -1; - - // - // Init needs to not ignore SIGCHLD so it can wait for this child. - // - - auto restore = signal(SIGCHLD, SIG_DFL); - - ChildPid = fork(); - if (ChildPid < 0) - { - LOG_ERROR("Forking child process for {} failed with {}", File, errno); - goto CreateProcessAndWaitEnd; - } - - if (ChildPid == 0) - { - // - // Restore default signal dispositions for the child process. - // - - if (UtilSetSignalHandlers(g_SavedSignalActions, false) < 0 || UtilRestoreBlockedSignals() < 0) - { - _exit(-1); - } - - // - // Set environment variables. - // - - for (const auto& e : Env) - { - setenv(e.first.c_str(), e.second.c_str(), 1); - } - - // - // Detach from the controlling terminal if requested. - // - - if (DetachTerminal) - { - if (setsid() == -1) - { - LOG_ERROR("setsid failed {}", errno); - _exit(-1); - } - } - - // - // Invoke the executable. - // - - // This explicit cast is okay for now because: - // 1. execv function is guaranteed to not alter the arguments - // 2. In sometime we probably will replace most of these string constants - // with std::string anyway. - execv(File, const_cast(Argv)); - LOG_ERROR("execv({}) failed with {}", File, errno); - _exit(-1); - } - - if (Status == nullptr) - { - Status = &LocalStatus; - } - - // - // TODO_LX: Do we need a timeout when waiting for the process? - // - - WaitResult = waitpid(ChildPid, Status, 0); - if (WaitResult < 0) - { - LOG_ERROR("Waiting for {} failed with {}", File, errno); - goto CreateProcessAndWaitEnd; - } - - if (*Status != 0) - { - LOG_ERROR("{} failed with status {:#x}", File, *Status); - goto CreateProcessAndWaitEnd; - } - - Result = 0; - -CreateProcessAndWaitEnd: - - // - // Restore the disposition of SIGCHLD. - // - - signal(SIGCHLD, restore); - - return Result; -} - -int UtilExecCommandLine(const char* CommandLine, std::string* Output, int ExpectedStatus, bool PrintError) - -/*++ - -Routine Description: - - This routine runs the command and optionally returns the output. - -Arguments: - - CommandLine - Supplies the command line of the process to launch. - - Output - Supplies an optional pointer to a std::string to receive the output of the command. - If no buffer is provided the output will appear in stdout. - - ExpectedStatus - Supplies the expected return status of the command. - - PrintError - Supplies a boolean that specifies if an error should be printed if the process does not return the expected status. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - // - // Exec the command and read the output. - // - - wil::unique_file Pipe{popen(CommandLine, "re")}; - if (!Pipe) - { - LOG_ERROR("popen({}) failed {}", CommandLine, errno); - return -1; - } - - std::vector Buffer(1024); - int Result = -1; - while (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) - { - if (Output) - { - (*Output) += Buffer.data(); - } - else - { - fputs(Buffer.data(), stdout); - } - } - - if (ferror(Pipe.get())) - { - Result = -1; - LOG_ERROR("fgets failed {}", errno); - goto ErrorExit; - } - - Result = 0; - -ErrorExit: - if (Pipe) - { - Result = pclose(Pipe.release()); - if (Result == -1) - { - LOG_ERROR("pclose failed {}", errno); - } - else - { - Result = UtilProcessChildExitCode(Result, CommandLine, ExpectedStatus, PrintError); - } - } - - return Result; -} - -std::string UtilFindMount(const char* MountInfoFile, const char* Path, bool WinPath, size_t* PrefixLength) - -/*++ - -Routine Description: - - This routine parses the /proc/self/mountinfo file to find a mount that - matches the specified path. - - N.B. The caller is responsible for freeing the returned replacement prefix - buffer. - -Arguments: - - MountInfoFile - Supplies the path to the mountinfo file. - - Path - Supplies the path. - - WinPath - Supplies a value that indicates whether the path is a Windows - path. - - PrefixLength - Supplies a pointer which receives the length of the prefix - that should be stripped from the path. - -Return Value: - - The replacement prefix on success, or an empty string on failure. - ---*/ - -try -{ - char** MatchField; - char** ReplacementField; - - mountutil::MountEnum MountEnum{MountInfoFile}; - if (WinPath != false) - { - MatchField = &MountEnum.Current().Source; - ReplacementField = &MountEnum.Current().MountPoint; - } - else - { - MatchField = &MountEnum.Current().MountPoint; - ReplacementField = &MountEnum.Current().Source; - } - - std::string FoundReplacement; - size_t FoundPrefixLength = 0; - while (MountEnum.Next()) - { - // - // If a mount point was previously found, and this mount point is a - // prefix of the path (or the previously found mount point, for Windows - // to Linux translation), it means that the path is not actually on - // the previously found mount, so discard that result. - // - // For example: - // - When translating /mnt/c/foo/bar, first /mnt/c is found, but a - // later entry indicates /mnt/c/foo is also a mount point (e.g. using - // tmpfs). This means /mnt/c/foo/bar is not on the /mnt/c mount. - // - When translating C:\foo, first /mnt/c is found. A later entry - // indicates /mnt itself is a mount point, making the earlier /mnt/c - // mount unreachable. - // - // TODO_LX: This doesn't catch the case when translating C:\foo\bar and - // /mnt/c/foo is a mount point. Handling that is more complicated. - // - - if (!FoundReplacement.empty()) - { - const char* LinuxPath = WinPath ? FoundReplacement.c_str() : Path; - size_t LinuxPrefixLength = UtilIsPathPrefix(LinuxPath, MountEnum.Current().MountPoint, false); - if (LinuxPrefixLength > 0) - { - FoundReplacement.resize(0); - } - } - - // - // For Plan 9, parse the actual mount source from the superblock options. - // For virtiofs, parse the mount source from source (for example drvfsC or drvfsaC). - // If the file system isn't Plan 9, virtiofs, or DrvFs, skip this mount. - // - - std::string MountSource; - if (strcmp(MountEnum.Current().FileSystemType, PLAN9_FS_TYPE) == 0) - { - MountSource = UtilParsePlan9MountSource(MountEnum.Current().SuperOptions); - if (MountSource.empty()) - { - continue; - } - - MountEnum.Current().Source = MountSource.data(); - } - else if (strcmp(MountEnum.Current().FileSystemType, VIRTIO_FS_TYPE) == 0) - { - MountSource = QueryVirtiofsMountSource(MountEnum.Current().Source); - if (MountSource.empty()) - { - continue; - } - - MountEnum.Current().Source = MountSource.data(); - } - else if (strcmp(MountEnum.Current().FileSystemType, DRVFS_FS_TYPE) == 0) - { - // - // The mount source is a Windows path and may use forward slashes; - // flip them to backslashes. - // - - UtilCanonicalisePathSeparator(MountEnum.Current().Source, PATH_SEP_NT); - } - else - { - continue; - } - - // - // Strip the trailing backslash if present. - // - - size_t Length = strlen(MountEnum.Current().Source); - if ((Length > 0) && (MountEnum.Current().Source[Length - 1] == PATH_SEP_NT)) - { - MountEnum.Current().Source[Length - 1] = '\0'; - } - - // - // For bind mounts, use the concatenation of the mount source and root - // of the mount as the mount source string. - // - - std::string CombinedMountSource; - if (strcmp(MountEnum.Current().Root, "/") != 0) - { - CombinedMountSource += MountEnum.Current().Source; - CombinedMountSource += MountEnum.Current().Root; - UtilCanonicalisePathSeparator(CombinedMountSource, PATH_SEP_NT); - MountEnum.Current().Source = CombinedMountSource.data(); - } - - // - // Check if the match field is a prefix of the path. - // - // N.B. For Windows paths, only matches longer than the existing match - // are considered. This is because Windows mounts aren't - // guaranteed to be in order and NTFS directory mounts should be - // preferred over plain drive letter mounts if they match. - // - - Length = UtilIsPathPrefix(Path, *MatchField, WinPath); - if ((Length == 0) || ((WinPath != false) && (Length < FoundPrefixLength))) - { - continue; - } - - // - // Store the length of the prefix so the caller can strip it from the - // string. - // - - FoundPrefixLength = Length; - - // - // Store the replacement. - // - - FoundReplacement = *ReplacementField; - - // - // Continue searching the file even if a mount has been found, since - // newer mounts could shadow this one or be a nested mount. - // - } - - if (!FoundReplacement.empty() && PrefixLength != nullptr) - { - *PrefixLength = FoundPrefixLength; - } - - return FoundReplacement; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -std::optional UtilGetEnv(const char* Name, char* Environment) - -/*++ - -Routine Description: - - This queries the specified environment variable. - -Arguments: - - Name - Supplies the name to query. - - Environment - Supplies an environment block to search. If NULL is provided - the environment of the calling process is used. - -Return Value: - - The value of the specified environment variable, NULL if there is no match. - ---*/ - -{ - char* Current; - size_t Length; - size_t NameLength; - std::optional Value; - - if (Environment == nullptr) - { - const auto* EnvValue = getenv(Name); - if (EnvValue != nullptr) - { - Value = std::string{EnvValue}; - } - } - else - { - NameLength = strlen(Name); - for (size_t Index = 0;;) - { - Current = Environment + Index; - Length = strlen(Current); - if (Length == 0) - { - break; - } - - if ((strncmp(Current, Name, NameLength) == 0) && (Current[NameLength] == '=')) - { - Value = std::string{&Current[NameLength + 1]}; - break; - } - - Index += Length + 1; - } - } - - return Value; -} - -std::string UtilGetEnvironmentVariable(const char* Name) - -/*++ - -Routine Description: - - This queries the specified environment variable. If the value does not exist it gets the value from - the WSL interop server. - -Arguments: - - Name - Supplies the name to query. - -Return Value: - - The value of the specified environment variable if there is a match. - ---*/ - -try -{ - // - // Try to get the environment variable value. If it is not set, query the interop server for value. - // - - std::vector Buffer; - auto Value = getenv(Name); - if (Value == nullptr) - { - wsl::shared::SocketChannel channel{UtilConnectToInteropServer(), "InteropClient"}; - if (channel.Socket() < 0) - { - return {}; - } - - wsl::shared::MessageWriter Message(LxInitMessageQueryEnvironmentVariable); - Message.WriteString(Name); - - auto transaction = channel.StartTransaction(); - transaction.Send(Message.Span()); - - // - // Read a response, this will contain the environment variable value if it exists. - // - - Value = transaction.Receive().Buffer; - - // - // Set the environment variable for future queries. - // - - if (setenv(Name, Value, 1) < 0) - { - LOG_ERROR("setenv({}, {}, 1) failed {}", Name, Value, errno); - } - } - - return Value; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -int UtilGetFeatureFlags() - -/*++ - -Routine Description: - - This routine gets the feature flags, either directly, from an environment - variable, or by querying it from the init process. - -Arguments: - - None. - -Return Value: - - The feature flags. - ---*/ - -{ - // - // If feature flags are already known, return them. - // - - if (g_CachedFeatureFlags) - { - return *g_CachedFeatureFlags; - } - - // - // Check if the environment variable is present. - // - // N.B. This is used for processes launched directly from init (e.g. - // mount.drvfs during initial configuration), because they may not be - // able to connect to init. - // - - int FeatureFlags = LxInitFeatureNone; - const char* FeatureFlagEnv = getenv(WSL_FEATURE_FLAGS_ENV); - if (FeatureFlagEnv != nullptr) - { - FeatureFlags = strtol(FeatureFlagEnv, nullptr, 16); - } - else - { - // - // Query init for the value. If an error occurs, just return no features. - // - - wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; - if (channel.Socket() < 0) - { - return FeatureFlags; - } - - MESSAGE_HEADER Message; - Message.MessageType = LxInitMessageQueryFeatureFlags; - Message.MessageSize = sizeof(Message); - - auto transaction = channel.StartTransaction(); - transaction.Send(Message); - FeatureFlags = transaction.Receive>().Result; - } - - UtilSetFeatureFlags(FeatureFlags, FeatureFlagEnv == nullptr); - return FeatureFlags; -} - -void UtilSetFeatureFlags(int FeatureFlags, bool UpdateEnv) - -/*++ - -Routine Description: - - This routine sets the feature flags and updates the cached value and environment variable. - -Arguments: - - FeatureFlags - Supplies the feature flags to set. - - UpdateEnv - Supplies a boolean that indicates whether the environment variable should be updated. - -Return Value: - - None. - ---*/ - -try -{ - g_CachedFeatureFlags = FeatureFlags; - if (UpdateEnv) - { - auto FeatureFlagsString = std::format("{:x}", FeatureFlags); - if (setenv(WSL_FEATURE_FLAGS_ENV, FeatureFlagsString.c_str(), 1) < 0) - { - LOG_ERROR("setenv({}, {}, 1) failed {}", WSL_FEATURE_FLAGS_ENV, FeatureFlagsString, errno); - } - } -} -CATCH_LOG() - -std::optional UtilGetNetworkingMode(void) - -/*++ - -Routine Description: - - This routine queries the networking mode from the init process. - -Arguments: - - None. - -Return Value: - - The networking mode if successful, std::nullopt otherwise. - ---*/ - -try -{ - wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; - THROW_LAST_ERROR_IF(channel.Socket() < 0); - - MESSAGE_HEADER Message; - Message.MessageType = LxInitMessageQueryNetworkingMode; - Message.MessageSize = sizeof(Message); - - auto transaction = channel.StartTransaction(); - transaction.Send(Message); - - const auto& response = transaction.Receive>(); - auto NetworkingMode = static_cast(response.Result); - - THROW_ERRNO_IF(EINVAL, NetworkingMode < LxMiniInitNetworkingModeNone || NetworkingMode > LxMiniInitNetworkingModeVirtioProxy); - - return NetworkingMode; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -pid_t UtilGetPpid(pid_t Pid) - -/*++ - -Routine Description: - - This routine returns the parent process id of the specified process. - -Arguments: - - Pid - Supplies the process id to get the parent of. - -Return Value: - - The parent process id if successful, -1 otherwise. - ---*/ - -{ - // - // Open the /proc/[pid]/stat file. - // - - const auto FilePath = std::format("/proc/{}/stat", Pid); - std::ifstream File(FilePath); - - std::string Line; - if (!File || !std::getline(File, Line)) - { - return -1; - } - - // - // Parse the file. Sample format: "86 (bash) S 9". - // N.B. The second entry can contain a space so we can't just use strtok. - // - - const std::regex Pattern("^[0-9]+ \\(.*\\) \\w ([0-9]+).*"); - std::smatch Match; - if (!std::regex_match(Line, Match, Pattern) || Match.size() != 2) - { - LOG_ERROR("Failed to parse: {}, content: {}", FilePath, Line); - return -1; - } - - auto Result = strtol(Match.str(1).c_str(), nullptr, 10); - if (Result == 0) - { - LOG_ERROR("Failed to parse: {}, content: {}", FilePath, Line); - return -1; - } - - return Result; -} - -std::string UtilGetVmId(void) - -/*++ - -Routine Description: - - This routine queries the VM ID from the init process. - -Arguments: - - None. - -Return Value: - - The VM ID if successful, an empty string otherwise. - ---*/ - -try -{ - wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; - THROW_LAST_ERROR_IF(channel.Socket() < 0); - - wsl::shared::MessageWriter Message(LxInitMessageQueryVmId); - auto transaction = channel.StartTransaction(); - transaction.Send(Message.Span()); - - return transaction.Receive().Buffer; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -void UtilInitGroups(const char* User, gid_t Gid) - -/*++ - -Routine Description: - - This routine initializes the groups for the current process. - N.B. This is needed because the musl version of initgroups has a hard-coded 32 group max. - -Arguments: - - User - Supplies the user name. - - Gid - Supplies the group id. - -Return Value: - - None. - ---*/ - -{ - if (initgroups(User, Gid) < 0) - { - int Count{}; - getgrouplist(User, Gid, nullptr, &Count); - std::vector Groups(Count); - THROW_LAST_ERROR_IF(getgrouplist(User, Gid, Groups.data(), &Count) < 0); - - THROW_LAST_ERROR_IF(setgroups(Count, Groups.data()) < 0); - } -} - -void UtilInitializeMessageBuffer(std::vector& Buffer) - -/*++ - -Routine Description: - - This routine ensures the supplied buffer is initialized. - -Arguments: - - Buffer - Supplies the buffer to be initialized. - -Return Value: - - None. - ---*/ - -{ - if (Buffer.size() < INITIAL_MESSAGE_BUFFER_SIZE) - { - Buffer.resize(INITIAL_MESSAGE_BUFFER_SIZE); - } -} - -bool UtilIsAbsoluteWindowsPath(const char* Path) - -/*++ - -Routine Description: - - This routine determines if the supplied path is an absolute Windows path. - -Arguments: - - Path - Supplies the path to check. - -Return Value: - - true if the supplied path is an absolute Windows path, false otherwise. - ---*/ - -{ - if ((strlen(Path) < 3) || (!(((Path[0] == PATH_SEP_NT || Path[0] == PATH_SEP) && (Path[1] == PATH_SEP_NT || Path[1] == PATH_SEP)) || - (isalpha(Path[0]) && Path[1] == DRIVE_SEP_NT)))) - { - return false; - } - - return true; -} - -size_t UtilIsPathPrefix(const char* Path, const char* Prefix, bool WinPath) - -/*++ - -Routine Description: - - This routine checks if one path is a prefix of another. - -Arguments: - - Path - Supplies the path to check for a prefix. - - Prefix - Supplies the prefix to check for. - - WinPath - Supplies a value that indicates whether Path is a Windows path. - -Return Value: - - The length of the prefix, or 0 if there is no match. - ---*/ - -{ - size_t PathLength; - size_t PrefixLength; - char Separator; - - if (WinPath != false) - { - Separator = PATH_SEP_NT; - } - else - { - Separator = PATH_SEP; - } - - // - // Check the lengths and make sure the next character is a separator. - // - - PathLength = strlen(Path); - PrefixLength = strlen(Prefix); - if ((PathLength < PrefixLength) || ((PathLength > PrefixLength) && (Path[PrefixLength] != Separator))) - { - return 0; - } - - // - // Check if the prefix matches. - // - // N.B. For Windows paths, this is done case-insensitive. - // - - if (!wsl::shared::string::StartsWith(Path, Prefix, WinPath)) - { - return 0; - } - - return PrefixLength; -} - -bool UtilIsUtilityVm(void) - -/*++ - -Routine Description: - - This routine determines if the current process is running in a Utility VM or - an WSL1 based instance. - -Arguments: - - None. - -Return Value: - - true if this instance is VM Mode, false otherwise. - ---*/ - -{ - // - // If this process has not yet checked, inspect the Linux kernel release - // string. - // - - if (g_IsVmMode == -1) - { - // - // The VM Mode kernel release contains "microsoft", the lxcore kernel - // contains "Microsoft". - // - // TODO: Come up with a different way to detect if we are running under - // lxcore versus a Utility VM. - // - - struct utsname UnameBuffer; - memset(&UnameBuffer, 0, sizeof(UnameBuffer)); - if (uname(&UnameBuffer) < 0) - { - FATAL_ERROR("uname failed {}", errno); - } - - g_IsVmMode = (strstr(UnameBuffer.release, "Microsoft") == NULL); - } - - return g_IsVmMode; -} - -int UtilListenVsockAnyPort(struct sockaddr_vm* Address, int Backlog, bool CloseOnExec) - -/*++ - -Routine Description: - - This routine creates a bound and listening vsock socket an available port. - -Arguments: - - Address - Supplies a buffer to receive the socket address of the socket. - - Backlog - Supplies the length of the backlog. - -Return Value: - - A file descriptor representing the listening socket, -1 on failure. - ---*/ - -{ - int Result; - int SocketFd; - - int flags = SOCK_STREAM; - WI_SetFlagIf(flags, SOCK_CLOEXEC, CloseOnExec); - - SocketFd = UtilBindVsockAnyPort(Address, flags); - if (SocketFd < 0) - { - Result = -1; - goto ListenVsockAnyPortExit; - } - - Result = listen(SocketFd, Backlog); - if (Result < 0) - { - LOG_ERROR("listen failed {}", errno); - goto ListenVsockAnyPortExit; - } - - Result = SocketFd; - SocketFd = -1; - -ListenVsockAnyPortExit: - if (SocketFd != -1) - { - CLOSE(SocketFd); - } - - return Result; -} - -int UtilMkdir(const char* Path, mode_t Mode) - -/*++ - -Routine Description: - - This routine ensures the directory exists. - -Arguments: - - Path - Supplies the path of the directory to create. - - Mode - Supplies the mode. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - if ((mkdir(Path, Mode) < 0) && (errno != EEXIST)) - { - LOG_ERROR("mkdir({}, {:o}) failed {}", Path, Mode, errno); - return -1; - } - - return 0; -} - -int UtilMkdirPath(const char* Path, mode_t Mode, bool SkipLast) - -/*++ - -Routine Description: - - This routine ensures the directory exists. If necessary, all its parents - are created as well. - -Arguments: - - Path - Supplies the path of the directory to create. - - Mode - Supplies the mode. - - SkipLast - Indicates whether to skip creating the final entry in the path. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - std::string LocalPath{Path}; - std::string::size_type Index = 0; - - // - // Because the search is always from index + 1, the first leading / is skipped. - // - - for (;;) - { - Index = LocalPath.find_first_of(PATH_SEP, Index + 1); - if (Index != std::string::npos) - { - LocalPath[Index] = '\0'; - } - else if (SkipLast) - { - break; - } - - if (UtilMkdir(LocalPath.c_str(), Mode) < 0) - { - return -1; - } - - if (Index == std::string::npos) - { - break; - } - - LocalPath[Index] = PATH_SEP; - } - - return 0; -} - -int UtilMountFile(const char* Source, const char* Destination) -try -{ - // Is the file is a symlink, delete it since that would break the mount. - if (std::filesystem::is_symlink(Destination)) - { - std::filesystem::remove(Destination); - } - - wil::unique_fd Fd{open(Destination, (O_CREAT | O_WRONLY), 0755)}; - THROW_LAST_ERROR_IF(!Fd); - - THROW_LAST_ERROR_IF(mount(Source, Destination, nullptr, (MS_RDONLY | MS_BIND), nullptr) < 0); - THROW_LAST_ERROR_IF(mount(nullptr, Destination, nullptr, (MS_RDONLY | MS_REMOUNT | MS_BIND), nullptr) < 0); - - return 0; -} -CATCH_RETURN_ERRNO(); - -int UtilMount(const char* Source, const char* Target, const char* Type, unsigned long MountFlags, const char* Options, std::optional TimeoutSeconds) - -/*++ - -Routine Description: - - This routine performs a mount with a retry and timeout. - -Arguments: - - Source - Supplies the source of the mount. - - Target - Supplies the target of the mount. - - Type - Supplies the filesystem type. - - MountFlags - Supplies mount flags. - - Options - Supplies the mount options. - - TimeoutSeconds - Supplies an optional retry timeout in seconds. - -Return Value: - - 0 on success, < 0 on failure. - ---*/ - -{ - // - // Ensure the mount point exists. - // - - if (UtilMkdirPath(Target, 0755) < 0) - { - return -1; - } - - // - // Mount the device to the mount point. - // - // N.B. The mount operation is retried if: - // - The mount source does not yet exist (hot-added devices) - // - For Plan9 (9p): device is busy or not found - // - For VirtioFS: invalid tag (device not ready) - // - // N.B. MS_SHARED must be applied in a separate mount() call, so it is - // stripped from the initial mount flags and applied after the mount. - // - - const unsigned long initialFlags = MountFlags & ~MS_SHARED; - - try - { - if (TimeoutSeconds.has_value()) - { - wsl::shared::retry::RetryWithTimeout( - [&]() { THROW_LAST_ERROR_IF(mount(Source, Target, Type, initialFlags, Options) < 0); }, - c_defaultRetryPeriod, - TimeoutSeconds.value(), - [&]() { - errno = wil::ResultFromCaughtException(); - - // Generic device not ready errors - if (errno == ENXIO || errno == EIO || errno == ENOENT) - { - return true; - } - - // Filesystem-specific device readiness errors - if (Type != nullptr) - { - if ((strcmp(Type, PLAN9_FS_TYPE) == 0 && errno == EBUSY) || (strcmp(Type, VIRTIO_FS_TYPE) == 0 && errno == EINVAL)) - { - return true; - } - } - - return false; - }); - } - else - { - THROW_LAST_ERROR_IF(mount(Source, Target, Type, initialFlags, Options) < 0); - } - } - catch (...) - { - errno = wil::ResultFromCaughtException(); - LOG_ERROR("mount({}, {}, {}, {:#x}, {}) failed {}", Source, Target, Type, MountFlags, Options, errno); - return -errno; - } - - // N.B. The shared flag must be applied in a separate mount() call. - if (WI_IsFlagSet(MountFlags, MS_SHARED)) - { - if (mount(nullptr, Target, nullptr, MS_SHARED, nullptr) < 0) - { - LOG_ERROR("Failed to make shared mount {} {}", Target, errno); - return -errno; - } - } - - return 0; -} - -int UtilMountOverlayFs(const char* Target, const char* Lower, unsigned long MountFlags, std::optional TimeoutSeconds) - -/*++ - -Routine Description: - - This routine mounts an overlayfs at the specified location. - -Arguments: - - Target - Supplies target for the overlayfs mount. - - Lower - Supplies the lower layer for the overlayfs mount. Multiple lower - layers can be specified by a colon-separated list. - - MountFlags - Supplies mount flags for the operation. - - TimeoutSeconds - Supplies an optional timeout if the mount should be retried. - -Return Value: - - 0 on success, < 0 on failure. - ---*/ - -try -{ - // - // Set up the state required for overlayfs mount: - // - // - mount point for read/write overlayfs (this happens last) - // /rw - tmpfs mount for upper and work dirs - // /rw/upper - upper dir - // /rw/work - work dir - // - - if (UtilMkdirPath(Target, 0755) < 0) - { - return -1; - } - - auto Path = std::format("{}/rw", Target); - - // - // Create a tmpfs mount for the read/write layer - // - - if (UtilMount(nullptr, Path.c_str(), "tmpfs", 0, nullptr) < 0) - { - return -1; - } - - // - // Create upper and work directories. - // - - Path = std::format("{}/rw/upper", Target); - if (UtilMkdir(Path.c_str(), 0755) < 0) - { - return -1; - } - - auto MountOptions = std::format("lowerdir={},upperdir={},", Lower, Path); - Path = std::format("{}/rw/work", Target); - if (UtilMkdir(Path.c_str(), 0755) < 0) - { - return -1; - } - - MountOptions += std::format("workdir={}", Path); - if (UtilMount(nullptr, Target, "overlay", MountFlags, MountOptions.c_str(), TimeoutSeconds) < 0) - { - return -1; - } - - return 0; -} -CATCH_RETURN_ERRNO() - -int UtilOpenMountNamespace(void) - -/*++ - -Routine Description: - - This routine opens a file descriptor to the current mount namespace. - -Arguments: - - None. - -Return Value: - - A file descriptor representing the current mount namespace, -1 on failure. - ---*/ - -{ - int Fd; - - Fd = open("/proc/self/ns/mnt", (O_RDONLY | O_CLOEXEC)); - if (Fd < 0) - { - LOG_ERROR("open failed {}", errno); - } - - return Fd; -} - -int UtilParseCgroupsLine(char* Line, char** SubsystemName, bool* Enabled) - -/*++ - -Routine Description: - - This routine parses a line from the /proc/cgroups file. The output - buffers will be pointers into the provided line buffer and does not need to - be freed. - - N.B. The Line buffer will be modified to insert NULL terminators. - -Arguments: - - Line - Supplies the line to parse. - - SubsystemName - Supplies a buffer to receive the subsystem name. - - Enabled - Supplies a buffer to receive true if the subsystem is enabled, - false otherwise. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - char* Current; - int Field; - int Result; - - // - // Ignore comments. - // - - Current = strchr(Line, '#'); - if (Current != nullptr) - { - *Current = '\0'; - } - - for (Field = 0, Current = Line; ((Current != nullptr) && (Field <= WSL_CGROUPS_FIELD_MAX)); - Field += 1, Current = strchr(Current, WSL_CGROUPS_FIELD_SEP)) - { - // - // Replace field separators with NULL characters and skip past them. - // - - if (Field > 0) - { - *Current = '\0'; - Current += 1; - } - - switch (Field) - { - case WSL_CGROUPS_FIELD_SUBSYSTEM: - *SubsystemName = Current; - break; - - case WSL_CGROUPS_FIELD_ENABLED: - *Enabled = (*Current == '1'); - break; - } - } - - // - // Check if all the fields were found. If not, this is a malformed line. - // - - if (Field < WSL_CGROUPS_FIELD_MAX) - { - Result = -1; - goto ParseCgroupsLineEnd; - } - - Result = 0; - -ParseCgroupsLineEnd: - return Result; -} - -std::string UtilParsePlan9MountSource(std::string_view MountOptions) - -/*++ - -Routine Description: - - This routine parses the mount options to determine the actual source of a - a Plan 9 mount. - -Arguments: - - MountOptions - Supplies the mount options. - -Return Value: - - The mount source, or NULL if no valid option could be found. - ---*/ - -{ - // - // Search each option. - // - // N.B. The first option is always "ro" or "rw" so doesn't need to be - // considered. - // - - while (!MountOptions.empty()) - { - auto Current = UtilStringNextToken(MountOptions, WSL_MOUNT_OPTION_SEP); - if (wsl::shared::string::StartsWith(Current, PLAN9_ANAME_DRVFS)) - { - // - // Search for the sub path. - // - - auto MountSource = Current.substr(PLAN9_ANAME_DRVFS_LENGTH); - auto Index = Current.find(PLAN9_ANAME_PATH_OPTION); - if (Index == std::string_view::npos) - { - break; - } - - MountSource = Current.substr(Index + PLAN9_ANAME_PATH_OPTION_LENGTH); - MountSource = UtilStringNextToken(MountSource, PLAN9_ANAME_OPTION_SEP); - - // - // The value can only be used if it starts with a drive letter or - // the UNC prefix "UNC\" - // - - std::string Plan9Source; - if (MountSource.length() > 1 && isalpha(MountSource[0]) && MountSource[1] == DRIVE_SEP_NT) - { - Plan9Source = MountSource; - } - else if (wsl::shared::string::StartsWith(MountSource, PLAN9_UNC_TRANSLATED_PREFIX)) - { - Plan9Source = PLAN9_UNC_PREFIX; - Plan9Source += MountSource.substr(PLAN9_UNC_TRANSLATED_PREFIX_LENGTH); - } - else - { - break; - } - - // - // Ensure the returned path uses Windows path separators. - // - - UtilCanonicalisePathSeparator(Plan9Source, PATH_SEP_NT); - return Plan9Source; - } - } - - return {}; -} - -std::vector UtilParseWslEnv(char* NtEnvironment) - -/*++ - -Routine Description: - - This routine parses the WSLENV environment variable and constructs an - environment block with the resulting values. - -Arguments: - - NtEnvironment - Supplies an NT environment block. If NULL is provided, - the current processes's environment block is used. - -Return Value: - - The constructed env block. - ---*/ - -{ - std::optional EnvList; - bool Reverse = false; - - std::vector Output; - - auto Append = [&Output](const std::string_view& Content) { - for (auto e : Content) - { - Output.push_back(e); - } - }; - - Reverse = (NtEnvironment != nullptr); - - // - // Always add WSLENV to the block. - // - - Append(WSLENV_ENV "="); - - EnvList = UtilGetEnv(WSLENV_ENV, NtEnvironment); - if (EnvList.has_value()) - { - Append(EnvList.value()); - } - - Output.push_back('\0'); - if (EnvList.has_value()) - { - // - // Trim any whitespace from the end of the list. - // - - while (!EnvList->empty() && isspace(EnvList->back())) - { - EnvList->pop_back(); - } - - for (char *Sp, *EnvName = strtok_r(EnvList->data(), ":", &Sp); EnvName != nullptr; EnvName = strtok_r(NULL, ":", &Sp)) - { - char Mode = 0; - bool SkipTranslation = false; - char* Slash = strchr(EnvName, '/'); - if (Slash != nullptr) - { - *Slash = '\0'; - for (char* Flags = Slash + 1; *Flags != '\0'; Flags++) - { - switch (*Flags) - { - case 'p': // path - case 'l': // path list - if (Mode != 0 && Mode != 'p' && Mode != 'l') - { - SkipTranslation = true; - } - else - { - Mode = *Flags; - } - break; - - case 'u': // Win32 -> WSL translation only - if (Reverse == false) - { - SkipTranslation = true; - } - - break; - - case 'w': // WSL -> Win32 translation only - if (Reverse != false) - { - SkipTranslation = true; - } - - break; - - default: - - // - // Ignore entries with an unknown flag to support future - // extensibility. - // - - SkipTranslation = true; - break; - } - } - } - - auto EnvVal = UtilGetEnv(EnvName, NtEnvironment); - if (!SkipTranslation && EnvVal.has_value()) - { - switch (Mode) - { - case 'p': - case 'l': - { - - // - // Translate the path or path list. - // - - auto Result = UtilTranslatePathList(EnvVal->data(), Reverse); - if (Result.has_value()) - { - EnvVal = std::move(Result.value()); - } - else - { - SkipTranslation = true; - } - - break; - } - - default: - break; - } - } - - if (!SkipTranslation) - { - Append(std::format("{}={}", EnvName, EnvVal.value_or(""))); - Output.push_back('\0'); - } - } - } - - Output.push_back('\0'); - - return Output; -} - -int UtilProcessChildExitCode(int Status, const char* Name, int ExpectedStatus, bool PrintError) - -/*++ - -Routine Description: - - Handles the exit status of a child process. - -Arguments: - - Status - Supplies the exit status. - - Name - Supplies the process image name, for logging. - - ExpectedStatus - Supplies the expected exit status. - - PrintError - Supplies a boolean that specifies if an error should be printed if the process does not return the expected status. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - if (WIFEXITED(Status)) - { - Status = WEXITSTATUS(Status); - if (Status == ExpectedStatus) - { - return 0; - } - } - else if (WIFSIGNALED(Status)) - { - LOG_ERROR("{} killed by signal {}", Name, WTERMSIG(Status)); - return -1; - } - - if (PrintError) - { - LOG_ERROR("{} returned {}", Name, Status); - } - - return -1; -} - -ssize_t UtilRead(int Fd, void* Buffer, size_t BufferSize, int Timeout) - -/*++ - -Routine Description: - - This routine reads a message from the given file descriptor with an optional - timeout. - -Arguments: - - Fd - Supplies a file descriptor. - - Buffer - Supplies a buffer. - - BufferSize - Supplies the size of the buffer in bytes. - - Timeout - Supplies a timeout in milliseconds. - -Return Value: - - The number of bytes read, -1 on failure. - ---*/ - -{ - // - // If a timeout was specified, use a pollfd. - // - - ssize_t Result = 0; - if (Timeout != -1) - { - pollfd PollDescriptor{Fd, POLLIN, 0}; - Result = poll(&PollDescriptor, 1, Timeout); - if ((Result <= 0) || ((PollDescriptor.revents & POLLIN) == 0)) - { - errno = ETIMEDOUT; - Result = -1; - } - } - - if (Result != -1) - { - Result = TEMP_FAILURE_RETRY(read(Fd, Buffer, BufferSize)); - } - - return Result; -} - -ssize_t UtilReadBuffer(int Fd, std::vector& Buffer, int Timeout) - -/*++ - -Routine Description: - - This routine reads a message from the given file descriptor and - automatically grows the buffer when needed. - -Arguments: - - Fd - Supplies a file descriptor. - - Buffer - Supplies a buffer; this buffer will be resized if needed. - - Timeout - Supplies a timeout in milliseconds. - -Return Value: - - The number of bytes read, -1 on failure. - ---*/ - -try -{ - ssize_t Result; - - UtilInitializeMessageBuffer(Buffer); - for (;;) - { - Result = UtilRead(Fd, Buffer.data(), Buffer.size(), Timeout); - if (Result < 0) - { - // - // When the message buffer is too small, EOVERFLOW is returned and - // the buffer size is doubled. - // - - if (errno == EOVERFLOW) - { - Buffer.resize(Buffer.size() * 2); - continue; - } - } - - break; - } - - return Result; -} -CATCH_RETURN_ERRNO() - -std::string UtilReadFile(FILE* File) - -/*++ - -Routine Description: - - This routine reads an entire file into a buffer. - -Arguments: - - File - Supplies a open file stream. - -Return Value: - - A std::string that contains the contents of the file. - ---*/ - -{ - char* Line = nullptr; - size_t LineLength; - std::string output; - - // - // Ensure the file is at the beginning of the stream. - // - - rewind(File); - - // - // Read the entire file into a buffer. - // - - LineLength = 0; - while (getline(&Line, &LineLength, File) != -1) - { - output += Line; - output += '\n'; - } - - return output; -} - -std::vector UtilReadFileRaw(const char* Path, size_t MaxSize) -{ - wil::unique_fd file{open(Path, O_RDONLY)}; - THROW_LAST_ERROR_IF(!file); - - constexpr auto bufferSize = 4096; - - size_t offset = 0; - std::vector buffer; - while (true) - { - buffer.resize(offset + bufferSize); - - int result = read(file.get(), buffer.data() + offset, bufferSize); - THROW_LAST_ERROR_IF(result < 0); - - if (result == 0) - { - break; - } - - offset += result; - - if (offset > MaxSize) - { - LOG_ERROR("File \"{}\" is too big. Maximum size: {}", Path, MaxSize); - THROW_ERRNO(E2BIG); - } - } - - buffer.resize(offset); - - return buffer; -} - -std::pair, std::optional> UtilReadFlavorAndVersion(const char* Path) -try -{ - // See reference format: https://www.freedesktop.org/software/systemd/man/latest/os-release.html - std::ifstream file; - file.open(Path); - - std::optional version; - std::optional flavor; - std::regex versionPattern("^VERSION_ID=\"?([a-zA-Z0-9\\-_\\.,]*)\"?$"); - std::regex flavorPattern("^ID=\"?([a-zA-Z0-9\\-_\\.,]*)\"?$"); - - std::string line; - while (file && std::getline(file, line) && (!version.has_value() || !flavor.has_value())) - { - std::smatch match; - if (std::regex_search(line, match, versionPattern)) - { - version = match.str(1); - } - else if (std::regex_search(line, match, flavorPattern)) - { - flavor = match.str(1); - } - } - - return std::make_pair(std::move(flavor), std::move(version)); -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - - return {}; -} - -ssize_t UtilReadMessageLxBus(int MessageFd, std::vector& Buffer, bool ShutdownOnDisconnect) - -/*++ - -Routine Description: - - This routine reads a message from the server. - -Arguments: - - MessageFd - Supplies a message port file descriptor. - - Buffer - Supplies a buffer; this buffer will be resized if needed. - - ShutdownOnDisconnect - Supplies true for shutdown on disconnect, false for - _exit. - -Return Value: - - The number of bytes read, -1 on failure. - ---*/ - -try -{ - UtilInitializeMessageBuffer(Buffer); - wil::unique_fd Epoll{epoll_create1(EPOLL_CLOEXEC)}; - if (!Epoll) - { - FATAL_ERROR("Failed to create epoll {}", errno); - } - - epoll_event EpollEvent{}; - EpollEvent.events = EPOLLIN | EPOLLHUP; - EpollEvent.data.fd = MessageFd; - if (epoll_ctl(Epoll.get(), EPOLL_CTL_ADD, MessageFd, &EpollEvent) < 0) - { - FATAL_ERROR("Failed epoll_ctl {}", errno); - } - - ssize_t BytesRead; - for (;;) - { - // - // Message port read/write operations are blocking, so use an epoll to - // allow any incoming signals to be processed while waiting for the - // message. - // - - if (TEMP_FAILURE_RETRY(epoll_wait(Epoll.get(), &EpollEvent, 1, -1)) != 1) - { - FATAL_ERROR("Failed epoll_wait {}", errno); - } - - BytesRead = TEMP_FAILURE_RETRY(read(MessageFd, Buffer.data(), Buffer.size())); - if (BytesRead >= static_cast(sizeof(MESSAGE_HEADER))) - { - break; - } - - if (BytesRead < 0) - { - // - // When the message buffer is too small, EOVERFLOW is returned and - // the buffer is increased. When the Windows server disconnects - // EPIPE is returned and the process handles the disconnect. - // - - if (errno == EOVERFLOW) - { - auto BufferSizeNew = *(reinterpret_cast(Buffer.data())); - if (BufferSizeNew <= Buffer.size()) - { - BufferSizeNew = Buffer.size() * 2; - } - - Buffer.resize(BufferSizeNew); - continue; - } - - if (errno == EPIPE) - { - if (ShutdownOnDisconnect == false) - { - _exit(0); - } - } - - FATAL_ERROR("Failed to read message {}", errno); - break; - } - - FATAL_ERROR("Unexpected message size {}", BytesRead); - break; - } - - return BytesRead; -} -CATCH_RETURN_ERRNO() - -int UtilRestoreBlockedSignals() -{ - return sigprocmask(SIG_SETMASK, &g_originalSignals, nullptr); -} - -int UtilSaveBlockedSignals(const sigset_t& SignalMask) -{ - return sigprocmask(SIG_BLOCK, &SignalMask, &g_originalSignals); -} - -// Returns true for signals that should not be saved/restored: -// SIGKILL/SIGSTOP — not settable per POSIX. -// SIGCONT — left at default to allow process resumption. -// SIGHUP — handled separately by the caller. -// 32-34 — internal NPTL signals (__SIGRTMIN through __SIGRTMIN+2) reserved -// by glibc for thread cancellation and other runtime use. -static bool SkipSignal(unsigned int Signal) -{ - switch (Signal) - { - case SIGKILL: - case SIGSTOP: - case SIGCONT: - case SIGHUP: - case 32: - case 33: - case 34: - return true; - - default: - return false; - } -} - -int UtilSaveSignalHandlers(struct sigaction* SavedSignalActions) - -/*++ - -Routine Description: - - This routine saves all settable signal handlers, skipping signals - listed in SkipSignal() (non-settable, SIGHUP, and internal NPTL signals). - -Arguments: - - SavedSignalActions - Supplies an array to save default signal actions. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - for (unsigned int Index = 1; Index < _NSIG; Index += 1) - { - if (SkipSignal(Index)) - { - continue; - } - - if (sigaction(Index, NULL, &SavedSignalActions[Index]) < 0) - { - FATAL_ERROR("sigaction ({}) query failed {}", Index, errno); - } - } - - return 0; -} - -int UtilSetSignalHandlers(struct sigaction* SavedSignalActions, bool Ignore) - -/*++ - -Routine Description: - - This routine sets all settable signal handlers to the given handler, - skipping signals listed in SkipSignal(). - -Arguments: - - SavedSignalActions - Supplies an array of signal handlers to set. - - Ignore - Supplies a boolean specifying if signals should be ignored. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - struct sigaction SignalAction; - - for (unsigned int Index = 1; Index < _NSIG; Index += 1) - { - if (SkipSignal(Index)) - { - continue; - } - - memcpy(&SignalAction, &SavedSignalActions[Index], sizeof(SignalAction)); - if (Ignore != false) - { - SignalAction.sa_handler = SIG_IGN; - } - - if (sigaction(Index, &SignalAction, NULL) < 0) - { - FATAL_ERROR("sigaction ({}) set failed {}", Index, errno); - } - } - - return 0; -} - -void UtilSetThreadName(const char* Name) -{ - g_threadName = Name; - - if (prctl(PR_SET_NAME, reinterpret_cast(Name), 0, 0, 0) < 0) - { - LOG_ERROR("prctl failed {}", errno); - } -} - -void UtilSocketShutdown(int Fd, int How) - -/*++ - -Routine Description: - - This routine cleanly shuts down a socket. - -Arguments: - - Fd - Supplies a socket file descriptor. - - How - Supplies the type of shutdown. - -Return Value: - - None. - ---*/ - -{ - if (shutdown(Fd, How) < 0) - { - LOG_ERROR("shutdown({}) failed {}", How, errno); - } -} - -bool UtilSizeTAdd(size_t Left, size_t Right, size_t* Out) - -/*++ - -Routine Description: - - This routine checks if overflow will occur when adding two size_t values and - if overflow is not possible, adds the values. - -Arguments: - - Left - Supplies the first value. - - Right - Supplies the second value. - - Out - Supplies a pointer to receive sum on success. - -Return Value: - - true if addition was done without overflow, false otherwise. - ---*/ - -{ - bool Success = false; - if (Right > (Right + Left)) - { - goto SizeTAddExit; - } - - *Out = Left + Right; - Success = true; - -SizeTAddExit: - return Success; -} - -std::string_view UtilStringNextToken(std::string_view& View, std::string_view Separators) - -/*++ - -Routine Description: - - This routine extracts the next token identified by one of the specified - separators. - -Arguments: - - View - Supplies the string view to tokenize. On return, this parameter is - modified to contain the remaining portion of the string after the - separator, or an empty string if no separator was found. - - Separators - Supplies the separators that appear between the tokens. - -Return Value: - - The contents of the string up to the next separator, or the entire string - if no separator was found. - ---*/ - -{ - std::string_view Result; - auto Pos = View.find_first_of(Separators); - if (Pos == std::string_view::npos) - { - Result = View; - View = {}; - } - else - { - Result = View.substr(0, Pos); - View = View.substr(Pos + 1); - } - - return Result; -} - -std::string_view UtilStringNextToken(std::string_view& View, char Separator) - -/*++ - -Routine Description: - - This routine extracts the next token identified by the specified separator. - -Arguments: - - View - Supplies the string view to tokenize. On return, this parameter is - modified to contain the remaining portion of the string after the - separator, or an empty string if no separator was found. - - Separators - Supplies the separator that appears between the tokens. - -Return Value: - - The contents of the string up to the next separator, or the entire string - if no separator was found. - ---*/ - -{ - std::string_view Result; - auto Pos = View.find_first_of(Separator); - if (Pos == std::string_view::npos) - { - Result = View; - View = {}; - } - else - { - Result = View.substr(0, Pos); - View = View.substr(Pos + 1); - } - - return Result; -} - -std::optional UtilTranslatePathList(char* PathList, bool IsNtPathList) - -/*++ - -Routine Description: - - This routine translates a semicolon-separated list of NT paths into a - colon-separated list of Linux paths. - -Arguments: - - PathList - Supplies the semicolon-separated list of paths to translate. - - IsNtPathList - Supplies a boolean specifying if the list contains NT-style - paths. - - LxPath - Supplies a buffer to receive an allocated string with the - translated path list. - -Return Value: - - 0 on success, -error for failure. - ---*/ - -{ - char Mode; - const char* SourceSeparator; - char TargetSeparator; - std::string TranslatedList; - - if (IsNtPathList != false) - { - Mode = TRANSLATE_MODE_UNIX; - SourceSeparator = ";"; - TargetSeparator = ':'; - } - else - { - Mode = TRANSLATE_MODE_WINDOWS; - SourceSeparator = ":"; - TargetSeparator = ';'; - } - - // - // Translate each element in the list. If an element in the list fails to - // translate, ignore it. - // - - for (char *Sp, *Path = strtok_r(PathList, SourceSeparator, &Sp); Path != nullptr; Path = strtok_r(NULL, SourceSeparator, &Sp)) - { - // - // Skip relative Windows paths. - // - - if ((Mode == TRANSLATE_MODE_UNIX) && (!UtilIsAbsoluteWindowsPath(Path))) - { - continue; - } - - std::string TranslatedPath = WslPathTranslate(Path, 0, Mode); - if (TranslatedPath.empty()) - { - auto WarningMessage = wsl::shared::Localization::MessageFailedToTranslate(Path); - if (wil::ScopedWarningsCollector::CanCollectWarning()) - { - EMIT_USER_WARNING(std::move(WarningMessage)); - } - else - { - LOG_WARNING("{}", WarningMessage); - } - - continue; - } - - if (!TranslatedList.empty()) - { - TranslatedList += TargetSeparator; - } - - TranslatedList += TranslatedPath; - } - - if (TranslatedList.empty()) - { - return {}; - } - - return TranslatedList; -} - -std::string UtilWinPathTranslate(const char* Path, bool Reverse) - -/*++ - -Routine Description: - - This routine translates an absolute Linux path or an absolute - Windows path into the other. - -Arguments: - - Path - Supplies the path to translate. - - Reverse - Supplies a bool, if set perform translation from Windows->Linux - path; otherwise, translation from Linux->Windows path. - -Return Value: - - 0 on success, -error for general failure. - ---*/ - -try -{ - size_t PathLength; - size_t PrefixLength; - char* Suffix; - size_t SuffixLength; - size_t TranslatedLength; - size_t TranslatedSuffixLength; - - // - // Find if there is a DrvFs or Plan 9 mount for the specified path. - // - - auto PrefixReplacement = UtilFindMount(MOUNT_INFO_FILE, Path, Reverse, &PrefixLength); - if (PrefixReplacement.empty()) - { - // - // The path is not part of a DrvFs or Plan 9 mount, so the path should - // be translated to use the plan9 redirector. - // - - return UtilWinPathTranslateInternal(Path, Reverse); - } - - // - // If translating Linux to Windows, see if any characters need to be - // escaped. - // - - PathLength = strlen(Path); - SuffixLength = PathLength - PrefixLength; - if (Reverse == false) - { - // - // If the suffix is empty and the replacement prefix is just a drive - // letter, make space to append a backslash. - // - - if ((SuffixLength == 0) && (PrefixReplacement.length() == 2) && (PrefixReplacement[1] == DRIVE_SEP_NT)) - { - TranslatedSuffixLength = 1; - } - else - { - TranslatedSuffixLength = EscapePathForNtLength(&Path[PrefixLength]); - } - } - else - { - TranslatedSuffixLength = SuffixLength; - } - - // - // Construct the new path out of the replacement prefix and the remainder - // of the path, escaping it if necessary. - // - - std::string TranslatedPath{PrefixReplacement}; - TranslatedLength = PrefixReplacement.length() + TranslatedSuffixLength; - TranslatedPath.resize(TranslatedLength, '\0'); - Suffix = &TranslatedPath[PrefixReplacement.length()]; - if (TranslatedSuffixLength != SuffixLength) - { - // - // If the suffix is empty and the replacement prefix is just a drive - // letter, append a backslash. Otherwise, escape and append the - // suffix. - // - - if ((SuffixLength == 0) && (TranslatedSuffixLength == 1)) - { - *Suffix = PATH_SEP_NT; - } - else - { - EscapePathForNt(&Path[PrefixLength], Suffix); - } - } - else - { - memcpy(Suffix, &Path[PrefixLength], SuffixLength); - - // - // Make sure the translated path uses the correct separators. - // - // N.B. This is done by the escape method if escaping is necessary. - // - - UtilCanonicalisePathSeparator(Suffix, Reverse ? PATH_SEP : PATH_SEP_NT); - } - - return TranslatedPath; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -std::string UtilWinPathTranslateInternal(const char* Path, bool Reverse) - -/*++ - -Routine Description: - - This routine translates an absolute Linux path or an absolute - Windows path into the other. - -Arguments: - - Path - Supplies the path to translate. - - Reverse - Supplies a bool, if set perform translation from Windows->Linux - path; otherwise, translation from Linux->Windows path. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -try -{ - // - // Get the distribution name from the environment variable. - // - - const auto DistributionName = UtilGetEnvironmentVariable(WSL_DISTRO_NAME_ENV); - if (DistributionName.empty()) - { - return {}; - } - - // - // Construct a prefix (\\wsl.localhost\DistributionName). - // - - std::string Prefix{PLAN9_RDR_PREFIX}; - Prefix += DistributionName; - - // - // For Windows to Linux translation, concatenate the prefix and the escaped - // version of the path. For Linux to Windows translation, ensure the path - // begins with the prefix, remove the prefix, and unescape the path. - // - - std::string TranslatedPath{}; - if (Reverse == false) - { - TranslatedPath += Prefix; - const size_t EscapedPathLength = EscapePathForNtLength(Path); - std::string EscapedPath(EscapedPathLength, '\0'); - EscapePathForNt(Path, EscapedPath.data()); - TranslatedPath += EscapedPath; - } - else - { - auto matchesPrefix = [Path](const std::string_view& prefix) { - if (!wsl::shared::string::StartsWith(Path, prefix, true)) - { - return false; - } - - // Validate that the next character is a path separator or the end of the string to prevent matching other distribution paths like: - // \\wsl.localhost\- - - auto nextChar = Path[prefix.size()]; - return nextChar == '\0' || nextChar == PATH_SEP || nextChar == PATH_SEP_NT; - }; - - auto PrefixLength = Prefix.length(); - if (!matchesPrefix(Prefix)) - { - // - // Check the old \\wsl$ prefix if it's not \\wsl.localhost. - // - - std::string CompatPrefix{PLAN9_RDR_COMPAT_PREFIX}; - CompatPrefix += DistributionName; - if (!matchesPrefix(CompatPrefix)) - { - return {}; - } - - PrefixLength = CompatPrefix.length(); - } - - Path += PrefixLength; - if (strlen(Path) == 0) - { - TranslatedPath += PATH_SEP; - } - else - { - TranslatedPath += Path; - - // - // Canonicalize the path separators and unescape the string. - // - - UtilCanonicalisePathSeparator(TranslatedPath.data(), PATH_SEP); - UnescapePathInplace(TranslatedPath.data()); - } - } - - return TranslatedPath; -} -catch (...) -{ - LOG_CAUGHT_EXCEPTION(); - return {}; -} - -ssize_t UtilWriteBuffer(int Fd, gsl::span Buffer) - -/*++ - -Routine Description: - - This routine writes an entire buffer to the given file descriptor. - -Arguments: - - Fd - Supplies a file descriptor. - - Buffer - Supplies the buffer to write. - -Return Value: - - The total number of bytes written, -1 on failure. - ---*/ - -{ - return UtilWriteBuffer(Fd, Buffer.data(), Buffer.size()); -} - -ssize_t UtilWriteBuffer(int Fd, const void* Buffer, size_t BufferSize) - -/*++ - -Routine Description: - - This routine writes an entire buffer to the given file descriptor. - -Arguments: - - Fd - Supplies a file descriptor. - - Buffer - Supplies a buffer pointer. - - BufferSize - Supplies the buffer size. - -Return Value: - - The total number of bytes written, -1 on failure. - ---*/ - -{ - ssize_t BytesWritten; - ssize_t Result; - ssize_t TotalBytesWritten; - auto* Offset = static_cast(Buffer); - - Result = -1; - TotalBytesWritten = 0; - do - { - BytesWritten = TEMP_FAILURE_RETRY(write(Fd, Offset, BufferSize)); - if (BytesWritten < 0) - { - goto WriteBufferExit; - } - - BufferSize -= BytesWritten; - Offset += BytesWritten; - TotalBytesWritten += BytesWritten; - } while (BufferSize > 0); - - Result = TotalBytesWritten; - -WriteBufferExit: - return Result; -} - -ssize_t UtilWriteStringView(int Fd, std::string_view StringView) - -/*++ - -Routine Description: - - This routine writes a string view to the given file descriptor. - -Arguments: - - Fd - Supplies a file descriptor. - - StringView - Supplies the string view to write. - -Return Value: - - The total number of bytes written, -1 on failure. - ---*/ - -{ - return UtilWriteBuffer(Fd, StringView.data(), StringView.size()); -} - -std::wstring UtilReadFileContentW(std::string_view path) -{ - std::wifstream file; - file.exceptions(std::ifstream::failbit | std::ifstream::badbit); - file.open(path); - - return {std::istreambuf_iterator(file), {}}; -} - -std::string UtilReadFileContent(std::string_view path) -{ - std::ifstream file; - file.exceptions(std::ifstream::failbit | std::ifstream::badbit); - file.open(path); - - return {std::istreambuf_iterator(file), {}}; -} - -HvPciSwiotlbPool UtilReadHvPciSwiotlbPool() -{ - HvPciSwiotlbPool pool{}; - try - { - pool.Base = std::stoull(UtilReadFileContent("/sys/bus/vmbus/drivers/hv_pci/swiotlb_base"), nullptr, 0); - pool.Size = std::stoull(UtilReadFileContent("/sys/bus/vmbus/drivers/hv_pci/swiotlb_size"), nullptr, 0); - } - catch (...) - { - pool = {}; - } - - return pool; -} - -uint16_t UtilWinAfToLinuxAf(uint16_t WinAddressFamily) -{ - uint16_t LinuxAddressFamily = AF_UNSPEC; - - switch (WinAddressFamily) - { - case 2: - LinuxAddressFamily = AF_INET; - break; - case 23: - LinuxAddressFamily = AF_INET6; - break; - } - - return LinuxAddressFamily; -} - -int WriteToFile(const char* Path, const char* Content, int OpenFlags, int Permissions) - -/*++ - -Routine Description: - - Write content to the specified file. - -Arguments: - - Path - Supplies the path to the file to write. - - Content - Supplies the content to be written to the file. - - OpenFlags - Supplies the flags passed to open(). - - Permissions - Supplies the file mode used when O_CREAT causes the file to be created. - -Return Value: - - 0 on success, -1 on failure. - ---*/ - -{ - wil::unique_fd Fd{open(Path, OpenFlags, Permissions)}; - if (!Fd) - { - int errnoPrev = errno; - LOG_ERROR("open({}) failed {}", Path, errno); - errno = errnoPrev; - return -1; - } - - std::string_view Buffer{Content}; - auto Result = UtilWriteStringView(Fd.get(), Buffer); - if (Result != Buffer.size()) - { - int errnoPrev = errno; - LOG_ERROR("write({}, {}) failed {} {}", Path, Content, Result, errno); - errno = errnoPrev; - return -1; - } - - return 0; -} - -int ProcessCreateProcessMessage(wsl::shared::Transaction& Transaction, gsl::span Buffer) -{ - auto* Message = gslhelpers::try_get_struct(Buffer); - if (!Message) - { - LOG_ERROR("Unexpected message size {}", Buffer.size()); - return -1; - } - - auto sendResult = [&](unsigned long Result) { Transaction.SendResultMessage(Result); }; - - sockaddr_vm SocketAddress{}; - wil::unique_fd ListenSocket{UtilListenVsockAnyPort(&SocketAddress, 1, false)}; - THROW_LAST_ERROR_IF(!ListenSocket); - - sendResult(SocketAddress.svm_port); - - // Always return the execution result, since the service expects it - int execResult = -1; - auto sendExecResult = wil::scope_exit([&]() { sendResult(execResult); }); - - const char* Path = wsl::shared::string::FromSpan(Buffer, Message->PathIndex); - const char* Arguments = wsl::shared::string::FromSpan(Buffer, Message->CommandLineIndex); - - // Note: this makes the assumption that no empty arguments are in the message - std::vector ArgumentArray; - while (*Arguments != '\0') - { - ArgumentArray.emplace_back(Arguments); - Arguments += strlen(Arguments) + 1; - } - ArgumentArray.emplace_back(nullptr); - - auto ControlPipe = wil::unique_pipe::create(O_CLOEXEC); - - const int ChildPid = UtilCreateChildProcess("CreateChildProcess", [&]() { - try - { - wil::unique_fd ProcessSocket{UtilAcceptVsock(ListenSocket.get(), SocketAddress, SESSION_LEADER_ACCEPT_TIMEOUT_MS)}; - THROW_LAST_ERROR_IF(!ProcessSocket); - - THROW_LAST_ERROR_IF(dup2(ProcessSocket.get(), STDIN_FILENO) < 0); - THROW_LAST_ERROR_IF(dup2(ProcessSocket.get(), STDOUT_FILENO) < 0); - execv(Path, (char* const*)(ArgumentArray.data())); - - // If this point is reached, an error needs to be reported back since execv() failed. - THROW_LAST_ERROR(); - } - catch (...) - { - auto error = wil::ResultFromCaughtException(); - LOG_ERROR("Command execution failed: {}", errno); - - if (write(ControlPipe.write().get(), &error, sizeof(error)) != sizeof(error)) - { - LOG_ERROR("Failed to write command execution status: {}", errno); - } - } - }); - - THROW_LAST_ERROR_IF(ChildPid < 0); - ControlPipe.write().reset(); - - int ReadResult = TEMP_FAILURE_RETRY(read(ControlPipe.read().get(), &execResult, sizeof(execResult))); - THROW_LAST_ERROR_IF(ReadResult < 0); - - // If the pipe closed without data, then exec() was successful - if (ReadResult == 0) - { - execResult = 0; - } - else if (execResult == sizeof(execResult)) - { - // Otherwise, return the error code to the service - execResult = abs(execResult); - } - else - { - execResult = EINVAL; - } - - return 0; -} - -#define RECLAIM_PATH CGROUP_MOUNTPOINT "/memory.reclaim" - -static long long int GetUserCpuTime() - -/*++ - -Routine Description: - - This routine parses /proc/stat to query a summary of all user CPU time. - -Arguments: - - None. - -Return Value: - - The current user CPU counter for all cores. - ---*/ - -{ - wil::unique_fd Fd{open("/proc/stat", O_RDONLY)}; - if (!Fd) - { - LOG_ERROR("open failed {}", errno); - return -1; - } - - char Buffer[32]; - int Result = TEMP_FAILURE_RETRY(read(Fd.get(), Buffer, (sizeof(Buffer) - 1))); - if (Result <= 0) - { - LOG_ERROR("read failed {}", errno); - return -1; - } - - // - // Parse the first line of /proc/stat which is in the format - // "cpu ". - // - - Buffer[Result] = '\0'; - char* Sp1; - char* Info = strtok_r(Buffer, " \n", &Sp1); - if (Info == nullptr) - { - LOG_ERROR("/proc/stat first line missing cpu label"); - return -1; - } - - Info = strtok_r(nullptr, " \n", &Sp1); - if (Info == nullptr) - { - LOG_ERROR("/proc/stat first line missing cpu counter"); - return -1; - } - - return strtoll(Info, nullptr, 10); -} - -static ssize_t GetMemoryInUse() - -/*++ - -Routine Description: - - This routine returns the amount memory in use in bytes. - -Arguments: - - None. - -Return Value: - - Total memory - Free memory. Includes that used by cache and buffers. - ---*/ - -try -{ - struct sysinfo Info = {}; - THROW_LAST_ERROR_IF(sysinfo(&Info) < 0); - return (Info.totalram - Info.freeram) * Info.mem_unit; -} -CATCH_RETURN_ERRNO() - -void StartMemoryReductionThread(LX_MINI_INIT_MEMORY_RECLAIM_MODE Mode) - -/*++ - -Routine Description: - - This routine starts a background thread that performs memory compaction and optional cache/memory - reclaim when the VM is idle. This ensures that the maximum number of pages can be discarded to the host. - -Arguments: - - Mode - Supplies the memory reclaim mode. - -Return Value: - - None. - ---*/ - -try -{ - std::thread([Mode = Mode]() mutable { - try - { - // - // Set the thread's scheduling policy to idle. - // - - sched_param Parameter{}; - Parameter.sched_priority = 0; - const int Result = pthread_setschedparam(pthread_self(), SCHED_IDLE, &Parameter); - THROW_ERRNO_IF(Result, Result != 0); - - // - // Periodically check if the machine is idle by querying procfs for CPU usage. - // Memory compaction will occur if both of the following conditions are true: - // 1. The CPU time since the last check is greater than the idle threshold. - // 2. The current CPU usage is below the idle threshold. This is measured by taking two readings one second apart. - // - - double MemoryLow = 1024 * 1024 * 1024; - double MemoryHigh = 1.1 * 1024.0 * 1024.0 * 1024.0; - const int IdleThreshold = get_nprocs(); // Change math to adjust if sysconf(_SC_CLK_TCK) != 100? Is 1% - long long int Start, Stop = 0; - auto constexpr SleepDuration = std::chrono::seconds(30); - size_t ReclaimIndex = 0; - long long int const ReclaimThreshold = (get_nprocs() * sysconf(_SC_CLK_TCK) * SleepDuration / std::chrono::seconds(1)) / 200; // 0.5% - long long int ReclaimWindow[20] = {}; // 10 minutes - long long int ReclaimWindowLength = COUNT_OF(ReclaimWindow); - bool ReclaimIdling = false; - - // - // Fall back to drop cache if the required cgroup path is not present. - // - - if (Mode == LxMiniInitMemoryReclaimModeGradual && access(RECLAIM_PATH, W_OK) < 0) - { - LOG_WARNING("access({}, W_OK) failed {}, falling back to drop_caches", RECLAIM_PATH, errno); - Mode = LxMiniInitMemoryReclaimModeDropCache; - } - - if (Mode == LxMiniInitMemoryReclaimModeGradual) - { - static_assert(COUNT_OF(ReclaimWindow) >= 6); - ReclaimWindowLength = 6; // Set to 3 minutes. - } - - for (auto i = 1; i < ReclaimWindowLength; i++) - { - ReclaimWindow[i] = LLONG_MIN; - } - - std::this_thread::sleep_for(SleepDuration); - for (;;) - { - auto const Target = std::chrono::steady_clock::now() + SleepDuration; - Start = GetUserCpuTime(); - THROW_LAST_ERROR_IF(Start == -1); - - if (Mode != LxMiniInitMemoryReclaimModeDisabled) - { - // - // Ensure that utilization is below 0.5% from the last 30 seconds, and last n minutes, of usage. - // - - size_t const LastIndex = (ReclaimIndex + 1) % ReclaimWindowLength; - if ((ReclaimWindow[LastIndex] > Start - ReclaimThreshold * (ReclaimWindowLength + 1)) && - (ReclaimWindow[ReclaimIndex] > Start - ReclaimThreshold)) - { - if (Mode == LxMiniInitMemoryReclaimModeGradual) - { - double MemorySize = GetMemoryInUse(); - THROW_LAST_ERROR_IF(MemorySize < 0); - - if (MemorySize > MemoryHigh) - { - ReclaimIdling = false; - } - - if (!ReclaimIdling && MemorySize > MemoryLow) - { - double MemoryTargetSize = MemorySize * 0.97; - std::string MemoryToFree = std::to_string(size_t(MemorySize - MemoryTargetSize)); - // EAGAIN Means that it attempted, but was unable to evict sufficient pages. - THROW_LAST_ERROR_IF(WriteToFile(RECLAIM_PATH, MemoryToFree.c_str()) < 0 && errno != EAGAIN); - - if (MemoryTargetSize < MemoryLow) - { - ReclaimIdling = true; - } - } - } - else if (!ReclaimIdling) - { - ReclaimIdling = true; - THROW_LAST_ERROR_IF(WriteToFile("/proc/sys/vm/drop_caches", "1\n") < 0); - } - } - else - { - ReclaimIdling = false; - } - - ReclaimIndex = LastIndex; - ReclaimWindow[ReclaimIndex] = Start; - } - - // - // Perform memory compaction if the VM is idle. - // This coalesces free pages into larger blocks for more efficient page reporting. - // - - if ((Start - Stop) > IdleThreshold) - { - std::this_thread::sleep_for(std::chrono::seconds(1)); - Stop = GetUserCpuTime(); - THROW_LAST_ERROR_IF(Stop == -1); - if ((Stop - Start) < IdleThreshold) - { - THROW_LAST_ERROR_IF(WriteToFile("/proc/sys/vm/compact_memory", "1\n") < 0); - } - } - - std::this_thread::sleep_until(Target); - } - } - CATCH_LOG() - }).detach(); -} -CATCH_LOG() +/*++ + +Copyright (c) Microsoft. All rights reserved. + +Module Name: + + util.c + +Abstract: + + This file utility function definitions. + +--*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "wslpath.h" +#include "util.h" +#include "drvfs.h" +#include "escape.h" +#include "config.h" +#include "mountutilcpp.h" +#include "message.h" +#include "RuntimeErrorWithSourceLocation.h" +#include "SocketChannel.h" +#include "Localization.h" + +#define INITIAL_MESSAGE_BUFFER_SIZE (0x1000) + +#define PLAN9_RDR_PREFIX "\\\\wsl.localhost\\" +#define PLAN9_RDR_COMPAT_PREFIX "\\\\wsl$\\" + +#define WSLENV_ENV "WSLENV" + +#define WSL_CGROUPS_FIELD_ENABLED (3) +#define WSL_CGROUPS_FIELD_MAX WSL_CGROUPS_FIELD_ENABLED +#define WSL_CGROUPS_FIELD_SEP '\t' +#define WSL_CGROUPS_FIELD_SUBSYSTEM (0) + +#define WSL_MOUNT_OPTION_SEP ',' + +int g_IsVmMode = -1; +static std::optional g_CachedFeatureFlags; +static sigset_t g_originalSignals; +thread_local std::string g_threadName; + +namespace wil { + +thread_local std::optional ScopedWarningsCollector::g_collectedWarnings; + +} + +int InteropServer::Create() + +/*++ + +Routine Description: + + This routine creates an interop server unix socket and starts listening on it. + +Arguments: + + None. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + if (!m_InteropSocketPath.empty()) + { + LOG_ERROR("Interop server already created"); + return -1; + } + + // + // Generate a unique name to be used for the interop socket path. + // + + m_InteropSocketPath = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, getpid(), WSL_INTEROP_SOCKET); + + // + // Ensure the WSL temp folder exists and has the correct mode. + // + + if (UtilMkdir(WSL_TEMP_FOLDER, WSL_TEMP_FOLDER_MODE) < 0) + { + return -1; + } + + // + // Create a unix socket to handle interop requests. + // + // N.B. This is done before the child process is created to ensure that + // the socket is ready for connections. + // + + m_InteropSocket.reset(socket(AF_UNIX, (SOCK_STREAM | SOCK_CLOEXEC), 0)); + if (!m_InteropSocket) + { + LOG_ERROR("socket failed {}", errno); + return -1; + } + + sockaddr_un InteropSocketAddress{}; + InteropSocketAddress.sun_family = AF_UNIX; + strncpy(InteropSocketAddress.sun_path, m_InteropSocketPath.c_str(), (sizeof(InteropSocketAddress.sun_path) - 1)); + + auto Result = bind(m_InteropSocket.get(), reinterpret_cast(&InteropSocketAddress), sizeof(InteropSocketAddress)); + if (Result < 0) + { + LOG_ERROR("bind failed {}", errno); + return -1; + } + + Result = listen(m_InteropSocket.get(), -1); + if (Result < 0) + { + LOG_ERROR("listen failed {}", errno); + return -1; + } + + // + // Ensure that any users can connect to the interop socket. + // + + Result = chmod(m_InteropSocketPath.c_str(), 0777); + if (Result < 0) + { + LOG_ERROR("chmod failed {}", errno); + return -1; + } + + return 0; +} + +wil::unique_fd InteropServer::Accept() const + +/*++ + +Routine Description: + + This routine accepts a connection on the interop server. + +Arguments: + + None. + +Return Value: + + The socket. + +--*/ + +{ + wil::unique_fd InteropConnection{accept4(m_InteropSocket.get(), nullptr, nullptr, SOCK_CLOEXEC)}; + if (!InteropConnection) + { + LOG_ERROR("accept4 failed {}", errno); + return {}; + } + + timeval Timeout{}; + Timeout.tv_sec = INTEROP_TIMEOUT_SEC; + if (setsockopt(InteropConnection.get(), SOL_SOCKET, SO_RCVTIMEO, &Timeout, sizeof(Timeout)) < 0) + { + LOG_ERROR("setsockopt(SO_RCVTIMEO) failed {}", errno); + } + + return InteropConnection; +} + +void InteropServer::Reset() +{ + if (!m_InteropSocketPath.empty()) + { + unlink(m_InteropSocketPath.c_str()); + m_InteropSocketPath = {}; + } +} + +InteropServer::~InteropServer() +{ + Reset(); +} + +int UtilAcceptVsock(int SocketFd, sockaddr_vm SocketAddress, int Timeout, int SocketFlags) + +/*++ + +Routine Description: + + This routine accepts a socket connection. + +Arguments: + + SocketFd - Supplies a socket file descriptor. + + SocketAddress - Supplies the socket address. This is passed by value instead + of by reference because accept4 modifies the structure to contain the + address of the peer socket. + + Timeout - Supplies a timeout. + + SocketFlags - Supplies the socket flags. + +Return Value: + + A file descriptor representing the socket, -1 on failure. + +--*/ + +{ + // + // If a timeout was specified, use a pollfd to wait for the accept. + // + + int Result = 0; + if (Timeout == -1) + { + pollfd PollDescriptor{SocketFd, POLLIN, 0}; + + while (true) + { + Result = poll(&PollDescriptor, 1, 60 * 1000); + if (Result < 0) + { + LOG_ERROR("poll({}) failed, {}", SocketFd, errno); + return Result; + } + else if ((Result == 0) || ((PollDescriptor.revents & POLLIN) == 0)) + { + LOG_ERROR("Waiting for abnormally long accept({})", SocketFd); + } + else + { + break; + } + } + } + else + { + pollfd PollDescriptor{SocketFd, POLLIN, 0}; + Result = poll(&PollDescriptor, 1, Timeout); + if ((Result <= 0) || ((PollDescriptor.revents & POLLIN) == 0)) + { + errno = ETIMEDOUT; + Result = -1; + } + } + + if (Result != -1) + { + socklen_t SocketAddressSize = sizeof(SocketAddress); + Result = accept4(SocketFd, reinterpret_cast(&SocketAddress), &SocketAddressSize, SocketFlags); + } + + if (Result < 0) + { + LOG_ERROR("accept4 failed {}", errno); + } + + return Result; +} + +int UtilBindVsockAnyPort(struct sockaddr_vm* SocketAddress, int Type) + +/*++ + +Routine Description: + + This routine creates a bound vsock socket an available port. + +Arguments: + + SocketAddress - Supplies a buffer to receive the socket address of the + socket. + + Type - Supplies the socket type. + +Return Value: + + A file descriptor representing the bound socket, -1 on failure. + +--*/ + +{ + int Result; + socklen_t SocketAddressSize; + int SocketFd; + + SocketFd = socket(AF_VSOCK, Type, 0); + if (SocketFd < 0) + { + Result = -1; + LOG_ERROR("socket failed {}", errno); + goto BindVsockAnyPortExit; + } + + memset(SocketAddress, 0, sizeof(*SocketAddress)); + SocketAddress->svm_family = AF_VSOCK; + SocketAddress->svm_cid = VMADDR_CID_ANY; + SocketAddress->svm_port = VMADDR_PORT_ANY; + SocketAddressSize = sizeof(*SocketAddress); + Result = bind(SocketFd, (const struct sockaddr*)SocketAddress, SocketAddressSize); + + if (Result < 0) + { + LOG_ERROR("bind failed {}", errno); + goto BindVsockAnyPortExit; + } + + // + // Query the socket name to get the assigned port. + // + + Result = getsockname(SocketFd, (struct sockaddr*)SocketAddress, &SocketAddressSize); + + if (Result < 0) + { + LOG_ERROR("getsockname failed {}", errno); + goto BindVsockAnyPortExit; + } + + Result = SocketFd; + SocketFd = -1; + +BindVsockAnyPortExit: + if (SocketFd != -1) + { + CLOSE(SocketFd); + } + + return Result; +} + +size_t UtilCanonicalisePathSeparator(char* Path, char Separator) + +/*++ + +Routine Description: + + This routine ensures all separators in Path use the specified separator. + +Arguments: + + Path - Supplies the path to canonicalise. + + Separator - Supplies the separator character to be used. + +Return Value: + + The size of the new string. + +--*/ + +{ + size_t DestIndex; + size_t PathLength; + size_t SourceIndex; + + DestIndex = 0; + SourceIndex = 0; + PathLength = strlen(Path); + + // + // Iterate through the path, replacing all separators. + // + + for (; SourceIndex < PathLength; SourceIndex++) + { + if (Path[SourceIndex] == PATH_SEP || Path[SourceIndex] == PATH_SEP_NT) + { + // + // Don't add a separator if previous char already is a separator. + // Also handle the special case where 'Path' is a UNC path (\\X or //X) + // where both separators should be kept. + // + + if (DestIndex > 1 && Path[DestIndex - 1] == Separator) + { + continue; + } + + Path[DestIndex] = Separator; + } + else + { + Path[DestIndex] = Path[SourceIndex]; + } + + DestIndex++; + } + + Path[DestIndex] = '\0'; + return DestIndex; +} + +void UtilCanonicalisePathSeparator(std::string& Path, char Separator) + +/*++ + +Routine Description: + + This routine ensures all separators in Path use the specified separator. + +Arguments: + + Path - Supplies the path to canonicalise. + + Separator - Supplies the separator character to be used. + +Return Value: + + None. + +--*/ + +{ + Path.resize(UtilCanonicalisePathSeparator(Path.data(), Separator)); +} + +wil::unique_fd UtilConnectToInteropServer(std::optional Pid) + +/*++ + +Routine Description: + + This routine connects to the interop server of the current client process. + +Arguments: + + Pid - Supplies an optional process ID to connect to. + +Return Value: + + A file descriptor representing the connected socket, -1 on failure. + +--*/ + +try +{ + char* InteropSocketPath; + std::string Path; + if (Pid.has_value()) + { + Path = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, Pid.value(), WSL_INTEROP_SOCKET); + InteropSocketPath = Path.data(); + } + else + { + // + // Query the interop server environment variable. If the process does not + // have the environment variable, or if the socket does not exists, search through parent process tree for an + // interop server. + // + + InteropSocketPath = getenv(WSL_INTEROP_ENV); + if (InteropSocketPath == nullptr || (access(InteropSocketPath, F_OK) < 0 && errno == ENOENT)) + { + pid_t Parent = getppid(); + while (Parent > 0) + { + Path = std::format(WSL_INTEROP_SOCKET_FORMAT, WSL_TEMP_FOLDER, Parent, WSL_INTEROP_SOCKET); + if (access(Path.c_str(), F_OK) == 0) + { + InteropSocketPath = Path.data(); + break; + } + + Parent = UtilGetPpid(Parent); + } + + if (InteropSocketPath == nullptr) + { + return {}; + } + + setenv(WSL_INTEROP_ENV, InteropSocketPath, 1); + } + } + + // + // Connect to the server and return the connected socket to the caller. + // + + return UtilConnectUnix(InteropSocketPath); +} +CATCH_RETURN_ERRNO() + +wil::unique_fd UtilConnectUnix(const char* Path) + +/*++ + +Routine Description: + + This routine connects to the specified unix socket path. + +Arguments: + + Path - Supplies the path of the unix socket. + +Return Value: + + The connected socket, or a default-initialized value on failure. + +--*/ + +{ + wil::unique_fd Socket{socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)}; + if (!Socket) + { + LOG_ERROR("socket failed {}", errno); + return {}; + } + + sockaddr_un SocketAddress{}; + SocketAddress.sun_family = AF_UNIX; + strncpy(SocketAddress.sun_path, Path, sizeof(SocketAddress.sun_path) - 1); + if (connect(Socket.get(), reinterpret_cast(&SocketAddress), sizeof(SocketAddress)) < 0) + { + LOG_ERROR("connect failed {}", errno); + return {}; + } + + return Socket; +} + +wil::unique_fd UtilConnectVsock(unsigned int Port, bool CloseOnExec, std::optional SocketBuffer, const std::source_location& Source) noexcept + +/*++ + +Routine Description: + + This routine connects to a vsock with the specified port. + +Arguments: + + Port - Supplies the port to connect to. + + CloseOnExec - Supplies a boolean specifying if the socket file descriptor should be closed on exec. + + SocketBuffer - Optionally supplies the size to use for the socket send and receive buffers. + + Source - Supplies the caller location. + +Return Value: + + A file descriptor representing the connected socket, -1 on failure. + +--*/ + +{ + int Type = SOCK_STREAM; + WI_SetFlagIf(Type, SOCK_CLOEXEC, CloseOnExec); + wil::unique_fd SocketFd{socket(AF_VSOCK, Type, 0)}; + if (!SocketFd) + { + LOG_ERROR("socket failed {} (from: {})", errno, Source); + return {}; + } + + // + // Set the socket connect timeout. + // + + timeval Timeout{}; + Timeout.tv_sec = LX_INIT_HVSOCKET_TIMEOUT_SECONDS; + if (setsockopt(SocketFd.get(), AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT, &Timeout, sizeof(Timeout)) < 0) + { + LOG_ERROR("setsockopt SO_VM_SOCKETS_CONNECT_TIMEOUT failed {}, (from: {})", errno, Source); + return {}; + } + + if (SocketBuffer) + { + int BufferSize = *SocketBuffer; + if (setsockopt(SocketFd.get(), SOL_SOCKET, SO_SNDBUF, &BufferSize, sizeof(BufferSize)) < 0) + { + LOG_ERROR("setsockopt(SO_SNDBUF, {}) failed {}, (from: {})", BufferSize, errno, Source); + return {}; + } + + if (setsockopt(SocketFd.get(), SOL_SOCKET, SO_RCVBUF, &BufferSize, sizeof(BufferSize)) < 0) + { + LOG_ERROR("setsockopt(SO_RCVBUF, {}) failed {}, (from: {})", BufferSize, errno, Source); + return {}; + } + } + + sockaddr_vm SocketAddress{}; + SocketAddress.svm_family = AF_VSOCK; + SocketAddress.svm_cid = VMADDR_CID_HOST; + SocketAddress.svm_port = Port; + if (connect(SocketFd.get(), (const struct sockaddr*)&SocketAddress, sizeof(SocketAddress)) < 0) + { + LOG_ERROR("connect port {} failed {} (from: {})", Port, errno, Source); + return {}; + } + + return SocketFd; +} + +int UtilCreateProcessAndWait(const char* const File, const char* const Argv[], int* Status, const std::map& Env, bool DetachTerminal) + +/*++ + +Routine Description: + + This routine creates a helper process from init and waits for it to exit. + +Arguments: + + File - Supplies the file name to execute. + + Argv - Supplies the arguments for the command. + + Status - Supplies an optional pointer that receives the exit status of the + process. + + DetachTerminal - Supplies a boolean that, when true, calls setsid() in the + child process to detach it from the controlling terminal. + +Return Value: + + 0 on success, -1 on failure. + +--*/ +{ + pid_t ChildPid; + int Result; + int LocalStatus; + pid_t WaitResult; + + Result = -1; + + // + // Init needs to not ignore SIGCHLD so it can wait for this child. + // + + auto restore = signal(SIGCHLD, SIG_DFL); + + ChildPid = fork(); + if (ChildPid < 0) + { + LOG_ERROR("Forking child process for {} failed with {}", File, errno); + goto CreateProcessAndWaitEnd; + } + + if (ChildPid == 0) + { + // + // Restore default signal dispositions for the child process. + // + + if (UtilSetSignalHandlers(g_SavedSignalActions, false) < 0 || UtilRestoreBlockedSignals() < 0) + { + _exit(-1); + } + + // + // Set environment variables. + // + + for (const auto& e : Env) + { + setenv(e.first.c_str(), e.second.c_str(), 1); + } + + // + // Detach from the controlling terminal if requested. + // + + if (DetachTerminal) + { + if (setsid() == -1) + { + LOG_ERROR("setsid failed {}", errno); + _exit(-1); + } + } + + // + // Invoke the executable. + // + + // This explicit cast is okay for now because: + // 1. execv function is guaranteed to not alter the arguments + // 2. In sometime we probably will replace most of these string constants + // with std::string anyway. + execv(File, const_cast(Argv)); + LOG_ERROR("execv({}) failed with {}", File, errno); + _exit(-1); + } + + if (Status == nullptr) + { + Status = &LocalStatus; + } + + // + // TODO_LX: Do we need a timeout when waiting for the process? + // + + WaitResult = waitpid(ChildPid, Status, 0); + if (WaitResult < 0) + { + LOG_ERROR("Waiting for {} failed with {}", File, errno); + goto CreateProcessAndWaitEnd; + } + + if (*Status != 0) + { + LOG_ERROR("{} failed with status {:#x}", File, *Status); + goto CreateProcessAndWaitEnd; + } + + Result = 0; + +CreateProcessAndWaitEnd: + + // + // Restore the disposition of SIGCHLD. + // + + signal(SIGCHLD, restore); + + return Result; +} + +int UtilExecCommandLine(const char* CommandLine, std::string* Output, int ExpectedStatus, bool PrintError) + +/*++ + +Routine Description: + + This routine runs the command and optionally returns the output. + +Arguments: + + CommandLine - Supplies the command line of the process to launch. + + Output - Supplies an optional pointer to a std::string to receive the output of the command. + If no buffer is provided the output will appear in stdout. + + ExpectedStatus - Supplies the expected return status of the command. + + PrintError - Supplies a boolean that specifies if an error should be printed if the process does not return the expected status. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + // + // Exec the command and read the output. + // + + wil::unique_file Pipe{popen(CommandLine, "re")}; + if (!Pipe) + { + LOG_ERROR("popen({}) failed {}", CommandLine, errno); + return -1; + } + + std::vector Buffer(1024); + int Result = -1; + while (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) + { + if (Output) + { + (*Output) += Buffer.data(); + } + else + { + fputs(Buffer.data(), stdout); + } + } + + if (ferror(Pipe.get())) + { + Result = -1; + LOG_ERROR("fgets failed {}", errno); + goto ErrorExit; + } + + Result = 0; + +ErrorExit: + if (Pipe) + { + Result = pclose(Pipe.release()); + if (Result == -1) + { + LOG_ERROR("pclose failed {}", errno); + } + else + { + Result = UtilProcessChildExitCode(Result, CommandLine, ExpectedStatus, PrintError); + } + } + + return Result; +} + +std::string UtilFindMount(const char* MountInfoFile, const char* Path, bool WinPath, size_t* PrefixLength) + +/*++ + +Routine Description: + + This routine parses the /proc/self/mountinfo file to find a mount that + matches the specified path. + + N.B. The caller is responsible for freeing the returned replacement prefix + buffer. + +Arguments: + + MountInfoFile - Supplies the path to the mountinfo file. + + Path - Supplies the path. + + WinPath - Supplies a value that indicates whether the path is a Windows + path. + + PrefixLength - Supplies a pointer which receives the length of the prefix + that should be stripped from the path. + +Return Value: + + The replacement prefix on success, or an empty string on failure. + +--*/ + +try +{ + char** MatchField; + char** ReplacementField; + + mountutil::MountEnum MountEnum{MountInfoFile}; + if (WinPath != false) + { + MatchField = &MountEnum.Current().Source; + ReplacementField = &MountEnum.Current().MountPoint; + } + else + { + MatchField = &MountEnum.Current().MountPoint; + ReplacementField = &MountEnum.Current().Source; + } + + std::string FoundReplacement; + size_t FoundPrefixLength = 0; + while (MountEnum.Next()) + { + // + // If a mount point was previously found, and this mount point is a + // prefix of the path (or the previously found mount point, for Windows + // to Linux translation), it means that the path is not actually on + // the previously found mount, so discard that result. + // + // For example: + // - When translating /mnt/c/foo/bar, first /mnt/c is found, but a + // later entry indicates /mnt/c/foo is also a mount point (e.g. using + // tmpfs). This means /mnt/c/foo/bar is not on the /mnt/c mount. + // - When translating C:\foo, first /mnt/c is found. A later entry + // indicates /mnt itself is a mount point, making the earlier /mnt/c + // mount unreachable. + // + // TODO_LX: This doesn't catch the case when translating C:\foo\bar and + // /mnt/c/foo is a mount point. Handling that is more complicated. + // + + if (!FoundReplacement.empty()) + { + const char* LinuxPath = WinPath ? FoundReplacement.c_str() : Path; + size_t LinuxPrefixLength = UtilIsPathPrefix(LinuxPath, MountEnum.Current().MountPoint, false); + if (LinuxPrefixLength > 0) + { + FoundReplacement.resize(0); + } + } + + // + // For Plan 9, parse the actual mount source from the superblock options. + // For virtiofs, parse the mount source from source (for example drvfsC or drvfsaC). + // If the file system isn't Plan 9, virtiofs, or DrvFs, skip this mount. + // + + std::string MountSource; + if (strcmp(MountEnum.Current().FileSystemType, PLAN9_FS_TYPE) == 0) + { + MountSource = UtilParsePlan9MountSource(MountEnum.Current().SuperOptions); + if (MountSource.empty()) + { + continue; + } + + MountEnum.Current().Source = MountSource.data(); + } + else if (strcmp(MountEnum.Current().FileSystemType, VIRTIO_FS_TYPE) == 0) + { + MountSource = QueryVirtiofsMountSource(MountEnum.Current().Source); + if (MountSource.empty()) + { + continue; + } + + MountEnum.Current().Source = MountSource.data(); + } + else if (strcmp(MountEnum.Current().FileSystemType, DRVFS_FS_TYPE) == 0) + { + // + // The mount source is a Windows path and may use forward slashes; + // flip them to backslashes. + // + + UtilCanonicalisePathSeparator(MountEnum.Current().Source, PATH_SEP_NT); + } + else + { + continue; + } + + // + // Strip the trailing backslash if present. + // + + size_t Length = strlen(MountEnum.Current().Source); + if ((Length > 0) && (MountEnum.Current().Source[Length - 1] == PATH_SEP_NT)) + { + MountEnum.Current().Source[Length - 1] = '\0'; + } + + // + // For bind mounts, use the concatenation of the mount source and root + // of the mount as the mount source string. + // + + std::string CombinedMountSource; + if (strcmp(MountEnum.Current().Root, "/") != 0) + { + CombinedMountSource += MountEnum.Current().Source; + CombinedMountSource += MountEnum.Current().Root; + UtilCanonicalisePathSeparator(CombinedMountSource, PATH_SEP_NT); + MountEnum.Current().Source = CombinedMountSource.data(); + } + + // + // Check if the match field is a prefix of the path. + // + // N.B. For Windows paths, only matches longer than the existing match + // are considered. This is because Windows mounts aren't + // guaranteed to be in order and NTFS directory mounts should be + // preferred over plain drive letter mounts if they match. + // + + Length = UtilIsPathPrefix(Path, *MatchField, WinPath); + if ((Length == 0) || ((WinPath != false) && (Length < FoundPrefixLength))) + { + continue; + } + + // + // Store the length of the prefix so the caller can strip it from the + // string. + // + + FoundPrefixLength = Length; + + // + // Store the replacement. + // + + FoundReplacement = *ReplacementField; + + // + // Continue searching the file even if a mount has been found, since + // newer mounts could shadow this one or be a nested mount. + // + } + + if (!FoundReplacement.empty() && PrefixLength != nullptr) + { + *PrefixLength = FoundPrefixLength; + } + + return FoundReplacement; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +std::optional UtilGetEnv(const char* Name, char* Environment) + +/*++ + +Routine Description: + + This queries the specified environment variable. + +Arguments: + + Name - Supplies the name to query. + + Environment - Supplies an environment block to search. If NULL is provided + the environment of the calling process is used. + +Return Value: + + The value of the specified environment variable, NULL if there is no match. + +--*/ + +{ + char* Current; + size_t Length; + size_t NameLength; + std::optional Value; + + if (Environment == nullptr) + { + const auto* EnvValue = getenv(Name); + if (EnvValue != nullptr) + { + Value = std::string{EnvValue}; + } + } + else + { + NameLength = strlen(Name); + for (size_t Index = 0;;) + { + Current = Environment + Index; + Length = strlen(Current); + if (Length == 0) + { + break; + } + + if ((strncmp(Current, Name, NameLength) == 0) && (Current[NameLength] == '=')) + { + Value = std::string{&Current[NameLength + 1]}; + break; + } + + Index += Length + 1; + } + } + + return Value; +} + +std::string UtilGetEnvironmentVariable(const char* Name) + +/*++ + +Routine Description: + + This queries the specified environment variable. If the value does not exist it gets the value from + the WSL interop server. + +Arguments: + + Name - Supplies the name to query. + +Return Value: + + The value of the specified environment variable if there is a match. + +--*/ + +try +{ + // + // Try to get the environment variable value. If it is not set, query the interop server for value. + // + + std::vector Buffer; + auto Value = getenv(Name); + if (Value == nullptr) + { + wsl::shared::SocketChannel channel{UtilConnectToInteropServer(), "InteropClient"}; + if (channel.Socket() < 0) + { + return {}; + } + + wsl::shared::MessageWriter Message(LxInitMessageQueryEnvironmentVariable); + Message.WriteString(Name); + + auto transaction = channel.StartTransaction(); + transaction.Send(Message.Span()); + + // + // Read a response, this will contain the environment variable value if it exists. + // + + Value = transaction.Receive().Buffer; + + // + // Set the environment variable for future queries. + // + + if (setenv(Name, Value, 1) < 0) + { + LOG_ERROR("setenv({}, {}, 1) failed {}", Name, Value, errno); + } + } + + return Value; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +int UtilGetFeatureFlags() + +/*++ + +Routine Description: + + This routine gets the feature flags, either directly, from an environment + variable, or by querying it from the init process. + +Arguments: + + None. + +Return Value: + + The feature flags. + +--*/ + +{ + // + // If feature flags are already known, return them. + // + + if (g_CachedFeatureFlags) + { + return *g_CachedFeatureFlags; + } + + // + // Check if the environment variable is present. + // + // N.B. This is used for processes launched directly from init (e.g. + // mount.drvfs during initial configuration), because they may not be + // able to connect to init. + // + + int FeatureFlags = LxInitFeatureNone; + const char* FeatureFlagEnv = getenv(WSL_FEATURE_FLAGS_ENV); + if (FeatureFlagEnv != nullptr) + { + FeatureFlags = strtol(FeatureFlagEnv, nullptr, 16); + } + else + { + // + // Query init for the value. If an error occurs, just return no features. + // + + wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; + if (channel.Socket() < 0) + { + return FeatureFlags; + } + + MESSAGE_HEADER Message; + Message.MessageType = LxInitMessageQueryFeatureFlags; + Message.MessageSize = sizeof(Message); + + auto transaction = channel.StartTransaction(); + transaction.Send(Message); + FeatureFlags = transaction.Receive>().Result; + } + + UtilSetFeatureFlags(FeatureFlags, FeatureFlagEnv == nullptr); + return FeatureFlags; +} + +void UtilSetFeatureFlags(int FeatureFlags, bool UpdateEnv) + +/*++ + +Routine Description: + + This routine sets the feature flags and updates the cached value and environment variable. + +Arguments: + + FeatureFlags - Supplies the feature flags to set. + + UpdateEnv - Supplies a boolean that indicates whether the environment variable should be updated. + +Return Value: + + None. + +--*/ + +try +{ + g_CachedFeatureFlags = FeatureFlags; + if (UpdateEnv) + { + auto FeatureFlagsString = std::format("{:x}", FeatureFlags); + if (setenv(WSL_FEATURE_FLAGS_ENV, FeatureFlagsString.c_str(), 1) < 0) + { + LOG_ERROR("setenv({}, {}, 1) failed {}", WSL_FEATURE_FLAGS_ENV, FeatureFlagsString, errno); + } + } +} +CATCH_LOG() + +std::optional UtilGetNetworkingMode(void) + +/*++ + +Routine Description: + + This routine queries the networking mode from the init process. + +Arguments: + + None. + +Return Value: + + The networking mode if successful, std::nullopt otherwise. + +--*/ + +try +{ + wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; + THROW_LAST_ERROR_IF(channel.Socket() < 0); + + MESSAGE_HEADER Message; + Message.MessageType = LxInitMessageQueryNetworkingMode; + Message.MessageSize = sizeof(Message); + + auto transaction = channel.StartTransaction(); + transaction.Send(Message); + + const auto& response = transaction.Receive>(); + auto NetworkingMode = static_cast(response.Result); + + THROW_ERRNO_IF(EINVAL, NetworkingMode < LxMiniInitNetworkingModeNone || NetworkingMode > LxMiniInitNetworkingModeVirtioProxy); + + return NetworkingMode; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +pid_t UtilGetPpid(pid_t Pid) + +/*++ + +Routine Description: + + This routine returns the parent process id of the specified process. + +Arguments: + + Pid - Supplies the process id to get the parent of. + +Return Value: + + The parent process id if successful, -1 otherwise. + +--*/ + +{ + // + // Open the /proc/[pid]/stat file. + // + + const auto FilePath = std::format("/proc/{}/stat", Pid); + std::ifstream File(FilePath); + + std::string Line; + if (!File || !std::getline(File, Line)) + { + return -1; + } + + // + // Parse the file. Sample format: "86 (bash) S 9". + // N.B. The second entry can contain a space so we can't just use strtok. + // + + const std::regex Pattern("^[0-9]+ \\(.*\\) \\w ([0-9]+).*"); + std::smatch Match; + if (!std::regex_match(Line, Match, Pattern) || Match.size() != 2) + { + LOG_ERROR("Failed to parse: {}, content: {}", FilePath, Line); + return -1; + } + + auto Result = strtol(Match.str(1).c_str(), nullptr, 10); + if (Result == 0) + { + LOG_ERROR("Failed to parse: {}, content: {}", FilePath, Line); + return -1; + } + + return Result; +} + +std::string UtilGetVmId(void) + +/*++ + +Routine Description: + + This routine queries the VM ID from the init process. + +Arguments: + + None. + +Return Value: + + The VM ID if successful, an empty string otherwise. + +--*/ + +try +{ + wsl::shared::SocketChannel channel{UtilConnectUnix(WSL_INIT_INTEROP_SOCKET), "wslinfo"}; + THROW_LAST_ERROR_IF(channel.Socket() < 0); + + wsl::shared::MessageWriter Message(LxInitMessageQueryVmId); + auto transaction = channel.StartTransaction(); + transaction.Send(Message.Span()); + + return transaction.Receive().Buffer; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +void UtilInitGroups(const char* User, gid_t Gid) + +/*++ + +Routine Description: + + This routine initializes the groups for the current process. + N.B. This is needed because the musl version of initgroups has a hard-coded 32 group max. + +Arguments: + + User - Supplies the user name. + + Gid - Supplies the group id. + +Return Value: + + None. + +--*/ + +{ + if (initgroups(User, Gid) < 0) + { + int Count{}; + getgrouplist(User, Gid, nullptr, &Count); + std::vector Groups(Count); + THROW_LAST_ERROR_IF(getgrouplist(User, Gid, Groups.data(), &Count) < 0); + + THROW_LAST_ERROR_IF(setgroups(Count, Groups.data()) < 0); + } +} + +void UtilInitializeMessageBuffer(std::vector& Buffer) + +/*++ + +Routine Description: + + This routine ensures the supplied buffer is initialized. + +Arguments: + + Buffer - Supplies the buffer to be initialized. + +Return Value: + + None. + +--*/ + +{ + if (Buffer.size() < INITIAL_MESSAGE_BUFFER_SIZE) + { + Buffer.resize(INITIAL_MESSAGE_BUFFER_SIZE); + } +} + +bool UtilIsAbsoluteWindowsPath(const char* Path) + +/*++ + +Routine Description: + + This routine determines if the supplied path is an absolute Windows path. + +Arguments: + + Path - Supplies the path to check. + +Return Value: + + true if the supplied path is an absolute Windows path, false otherwise. + +--*/ + +{ + if ((strlen(Path) < 3) || (!(((Path[0] == PATH_SEP_NT || Path[0] == PATH_SEP) && (Path[1] == PATH_SEP_NT || Path[1] == PATH_SEP)) || + (isalpha(Path[0]) && Path[1] == DRIVE_SEP_NT)))) + { + return false; + } + + return true; +} + +size_t UtilIsPathPrefix(const char* Path, const char* Prefix, bool WinPath) + +/*++ + +Routine Description: + + This routine checks if one path is a prefix of another. + +Arguments: + + Path - Supplies the path to check for a prefix. + + Prefix - Supplies the prefix to check for. + + WinPath - Supplies a value that indicates whether Path is a Windows path. + +Return Value: + + The length of the prefix, or 0 if there is no match. + +--*/ + +{ + size_t PathLength; + size_t PrefixLength; + char Separator; + + if (WinPath != false) + { + Separator = PATH_SEP_NT; + } + else + { + Separator = PATH_SEP; + } + + // + // Check the lengths and make sure the next character is a separator. + // + + PathLength = strlen(Path); + PrefixLength = strlen(Prefix); + if ((PathLength < PrefixLength) || ((PathLength > PrefixLength) && (Path[PrefixLength] != Separator))) + { + return 0; + } + + // + // Check if the prefix matches. + // + // N.B. For Windows paths, this is done case-insensitive. + // + + if (!wsl::shared::string::StartsWith(Path, Prefix, WinPath)) + { + return 0; + } + + return PrefixLength; +} + +bool UtilIsUtilityVm(void) + +/*++ + +Routine Description: + + This routine determines if the current process is running in a Utility VM or + an WSL1 based instance. + +Arguments: + + None. + +Return Value: + + true if this instance is VM Mode, false otherwise. + +--*/ + +{ + // + // If this process has not yet checked, inspect the Linux kernel release + // string. + // + + if (g_IsVmMode == -1) + { + // + // The VM Mode kernel release contains "microsoft", the lxcore kernel + // contains "Microsoft". + // + // TODO: Come up with a different way to detect if we are running under + // lxcore versus a Utility VM. + // + + struct utsname UnameBuffer; + memset(&UnameBuffer, 0, sizeof(UnameBuffer)); + if (uname(&UnameBuffer) < 0) + { + FATAL_ERROR("uname failed {}", errno); + } + + g_IsVmMode = (strstr(UnameBuffer.release, "Microsoft") == NULL); + } + + return g_IsVmMode; +} + +int UtilListenVsockAnyPort(struct sockaddr_vm* Address, int Backlog, bool CloseOnExec) + +/*++ + +Routine Description: + + This routine creates a bound and listening vsock socket an available port. + +Arguments: + + Address - Supplies a buffer to receive the socket address of the socket. + + Backlog - Supplies the length of the backlog. + +Return Value: + + A file descriptor representing the listening socket, -1 on failure. + +--*/ + +{ + int Result; + int SocketFd; + + int flags = SOCK_STREAM; + WI_SetFlagIf(flags, SOCK_CLOEXEC, CloseOnExec); + + SocketFd = UtilBindVsockAnyPort(Address, flags); + if (SocketFd < 0) + { + Result = -1; + goto ListenVsockAnyPortExit; + } + + Result = listen(SocketFd, Backlog); + if (Result < 0) + { + LOG_ERROR("listen failed {}", errno); + goto ListenVsockAnyPortExit; + } + + Result = SocketFd; + SocketFd = -1; + +ListenVsockAnyPortExit: + if (SocketFd != -1) + { + CLOSE(SocketFd); + } + + return Result; +} + +int UtilMkdir(const char* Path, mode_t Mode) + +/*++ + +Routine Description: + + This routine ensures the directory exists. + +Arguments: + + Path - Supplies the path of the directory to create. + + Mode - Supplies the mode. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + if ((mkdir(Path, Mode) < 0) && (errno != EEXIST)) + { + LOG_ERROR("mkdir({}, {:o}) failed {}", Path, Mode, errno); + return -1; + } + + return 0; +} + +int UtilMkdirPath(const char* Path, mode_t Mode, bool SkipLast) + +/*++ + +Routine Description: + + This routine ensures the directory exists. If necessary, all its parents + are created as well. + +Arguments: + + Path - Supplies the path of the directory to create. + + Mode - Supplies the mode. + + SkipLast - Indicates whether to skip creating the final entry in the path. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + std::string LocalPath{Path}; + std::string::size_type Index = 0; + + // + // Because the search is always from index + 1, the first leading / is skipped. + // + + for (;;) + { + Index = LocalPath.find_first_of(PATH_SEP, Index + 1); + if (Index != std::string::npos) + { + LocalPath[Index] = '\0'; + } + else if (SkipLast) + { + break; + } + + if (UtilMkdir(LocalPath.c_str(), Mode) < 0) + { + return -1; + } + + if (Index == std::string::npos) + { + break; + } + + LocalPath[Index] = PATH_SEP; + } + + return 0; +} + +int UtilMountFile(const char* Source, const char* Destination) +try +{ + // Is the file is a symlink, delete it since that would break the mount. + if (std::filesystem::is_symlink(Destination)) + { + std::filesystem::remove(Destination); + } + + wil::unique_fd Fd{open(Destination, (O_CREAT | O_WRONLY), 0755)}; + THROW_LAST_ERROR_IF(!Fd); + + THROW_LAST_ERROR_IF(mount(Source, Destination, nullptr, (MS_RDONLY | MS_BIND), nullptr) < 0); + THROW_LAST_ERROR_IF(mount(nullptr, Destination, nullptr, (MS_RDONLY | MS_REMOUNT | MS_BIND), nullptr) < 0); + + return 0; +} +CATCH_RETURN_ERRNO(); + +int UtilMount(const char* Source, const char* Target, const char* Type, unsigned long MountFlags, const char* Options, std::optional TimeoutSeconds) + +/*++ + +Routine Description: + + This routine performs a mount with a retry and timeout. + +Arguments: + + Source - Supplies the source of the mount. + + Target - Supplies the target of the mount. + + Type - Supplies the filesystem type. + + MountFlags - Supplies mount flags. + + Options - Supplies the mount options. + + TimeoutSeconds - Supplies an optional retry timeout in seconds. + +Return Value: + + 0 on success, < 0 on failure. + +--*/ + +{ + // + // Ensure the mount point exists. + // + + if (UtilMkdirPath(Target, 0755) < 0) + { + return -1; + } + + // + // Mount the device to the mount point. + // + // N.B. The mount operation is retried if: + // - The mount source does not yet exist (hot-added devices) + // - For Plan9 (9p): device is busy or not found + // - For VirtioFS: invalid tag (device not ready) + // + // N.B. MS_SHARED must be applied in a separate mount() call, so it is + // stripped from the initial mount flags and applied after the mount. + // + + const unsigned long initialFlags = MountFlags & ~MS_SHARED; + + try + { + if (TimeoutSeconds.has_value()) + { + wsl::shared::retry::RetryWithTimeout( + [&]() { THROW_LAST_ERROR_IF(mount(Source, Target, Type, initialFlags, Options) < 0); }, + c_defaultRetryPeriod, + TimeoutSeconds.value(), + [&]() { + errno = wil::ResultFromCaughtException(); + + // Generic device not ready errors + if (errno == ENXIO || errno == EIO || errno == ENOENT) + { + return true; + } + + // Filesystem-specific device readiness errors + if (Type != nullptr) + { + if ((strcmp(Type, PLAN9_FS_TYPE) == 0 && errno == EBUSY) || (strcmp(Type, VIRTIO_FS_TYPE) == 0 && errno == EINVAL)) + { + return true; + } + } + + return false; + }); + } + else + { + THROW_LAST_ERROR_IF(mount(Source, Target, Type, initialFlags, Options) < 0); + } + } + catch (...) + { + errno = wil::ResultFromCaughtException(); + LOG_ERROR("mount({}, {}, {}, {:#x}, {}) failed {}", Source, Target, Type, MountFlags, Options, errno); + return -errno; + } + + // N.B. The shared flag must be applied in a separate mount() call. + if (WI_IsFlagSet(MountFlags, MS_SHARED)) + { + if (mount(nullptr, Target, nullptr, MS_SHARED, nullptr) < 0) + { + LOG_ERROR("Failed to make shared mount {} {}", Target, errno); + return -errno; + } + } + + return 0; +} + +int UtilMountOverlayFs(const char* Target, const char* Lower, unsigned long MountFlags, std::optional TimeoutSeconds) + +/*++ + +Routine Description: + + This routine mounts an overlayfs at the specified location. + +Arguments: + + Target - Supplies target for the overlayfs mount. + + Lower - Supplies the lower layer for the overlayfs mount. Multiple lower + layers can be specified by a colon-separated list. + + MountFlags - Supplies mount flags for the operation. + + TimeoutSeconds - Supplies an optional timeout if the mount should be retried. + +Return Value: + + 0 on success, < 0 on failure. + +--*/ + +try +{ + // + // Set up the state required for overlayfs mount: + // + // - mount point for read/write overlayfs (this happens last) + // /rw - tmpfs mount for upper and work dirs + // /rw/upper - upper dir + // /rw/work - work dir + // + + if (UtilMkdirPath(Target, 0755) < 0) + { + return -1; + } + + auto Path = std::format("{}/rw", Target); + + // + // Create a tmpfs mount for the read/write layer + // + + if (UtilMount(nullptr, Path.c_str(), "tmpfs", 0, nullptr) < 0) + { + return -1; + } + + // + // Create upper and work directories. + // + + Path = std::format("{}/rw/upper", Target); + if (UtilMkdir(Path.c_str(), 0755) < 0) + { + return -1; + } + + auto MountOptions = std::format("lowerdir={},upperdir={},", Lower, Path); + Path = std::format("{}/rw/work", Target); + if (UtilMkdir(Path.c_str(), 0755) < 0) + { + return -1; + } + + MountOptions += std::format("workdir={}", Path); + if (UtilMount(nullptr, Target, "overlay", MountFlags, MountOptions.c_str(), TimeoutSeconds) < 0) + { + return -1; + } + + return 0; +} +CATCH_RETURN_ERRNO() + +int UtilOpenMountNamespace(void) + +/*++ + +Routine Description: + + This routine opens a file descriptor to the current mount namespace. + +Arguments: + + None. + +Return Value: + + A file descriptor representing the current mount namespace, -1 on failure. + +--*/ + +{ + int Fd; + + Fd = open("/proc/self/ns/mnt", (O_RDONLY | O_CLOEXEC)); + if (Fd < 0) + { + LOG_ERROR("open failed {}", errno); + } + + return Fd; +} + +int UtilParseCgroupsLine(char* Line, char** SubsystemName, bool* Enabled) + +/*++ + +Routine Description: + + This routine parses a line from the /proc/cgroups file. The output + buffers will be pointers into the provided line buffer and does not need to + be freed. + + N.B. The Line buffer will be modified to insert NULL terminators. + +Arguments: + + Line - Supplies the line to parse. + + SubsystemName - Supplies a buffer to receive the subsystem name. + + Enabled - Supplies a buffer to receive true if the subsystem is enabled, + false otherwise. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + char* Current; + int Field; + int Result; + + // + // Ignore comments. + // + + Current = strchr(Line, '#'); + if (Current != nullptr) + { + *Current = '\0'; + } + + for (Field = 0, Current = Line; ((Current != nullptr) && (Field <= WSL_CGROUPS_FIELD_MAX)); + Field += 1, Current = strchr(Current, WSL_CGROUPS_FIELD_SEP)) + { + // + // Replace field separators with NULL characters and skip past them. + // + + if (Field > 0) + { + *Current = '\0'; + Current += 1; + } + + switch (Field) + { + case WSL_CGROUPS_FIELD_SUBSYSTEM: + *SubsystemName = Current; + break; + + case WSL_CGROUPS_FIELD_ENABLED: + *Enabled = (*Current == '1'); + break; + } + } + + // + // Check if all the fields were found. If not, this is a malformed line. + // + + if (Field < WSL_CGROUPS_FIELD_MAX) + { + Result = -1; + goto ParseCgroupsLineEnd; + } + + Result = 0; + +ParseCgroupsLineEnd: + return Result; +} + +std::string UtilParsePlan9MountSource(std::string_view MountOptions) + +/*++ + +Routine Description: + + This routine parses the mount options to determine the actual source of a + a Plan 9 mount. + +Arguments: + + MountOptions - Supplies the mount options. + +Return Value: + + The mount source, or NULL if no valid option could be found. + +--*/ + +{ + // + // Search each option. + // + // N.B. The first option is always "ro" or "rw" so doesn't need to be + // considered. + // + + while (!MountOptions.empty()) + { + auto Current = UtilStringNextToken(MountOptions, WSL_MOUNT_OPTION_SEP); + if (wsl::shared::string::StartsWith(Current, PLAN9_ANAME_DRVFS)) + { + // + // Search for the sub path. + // + + auto MountSource = Current.substr(PLAN9_ANAME_DRVFS_LENGTH); + auto Index = Current.find(PLAN9_ANAME_PATH_OPTION); + if (Index == std::string_view::npos) + { + break; + } + + MountSource = Current.substr(Index + PLAN9_ANAME_PATH_OPTION_LENGTH); + MountSource = UtilStringNextToken(MountSource, PLAN9_ANAME_OPTION_SEP); + + // + // The value can only be used if it starts with a drive letter or + // the UNC prefix "UNC\" + // + + std::string Plan9Source; + if (MountSource.length() > 1 && isalpha(MountSource[0]) && MountSource[1] == DRIVE_SEP_NT) + { + Plan9Source = MountSource; + } + else if (wsl::shared::string::StartsWith(MountSource, PLAN9_UNC_TRANSLATED_PREFIX)) + { + Plan9Source = PLAN9_UNC_PREFIX; + Plan9Source += MountSource.substr(PLAN9_UNC_TRANSLATED_PREFIX_LENGTH); + } + else + { + break; + } + + // + // Ensure the returned path uses Windows path separators. + // + + UtilCanonicalisePathSeparator(Plan9Source, PATH_SEP_NT); + return Plan9Source; + } + } + + return {}; +} + +std::vector UtilParseWslEnv(char* NtEnvironment) + +/*++ + +Routine Description: + + This routine parses the WSLENV environment variable and constructs an + environment block with the resulting values. + +Arguments: + + NtEnvironment - Supplies an NT environment block. If NULL is provided, + the current processes's environment block is used. + +Return Value: + + The constructed env block. + +--*/ + +{ + std::optional EnvList; + bool Reverse = false; + + std::vector Output; + + auto Append = [&Output](const std::string_view& Content) { + for (auto e : Content) + { + Output.push_back(e); + } + }; + + Reverse = (NtEnvironment != nullptr); + + // + // Always add WSLENV to the block. + // + + Append(WSLENV_ENV "="); + + EnvList = UtilGetEnv(WSLENV_ENV, NtEnvironment); + if (EnvList.has_value()) + { + Append(EnvList.value()); + } + + Output.push_back('\0'); + if (EnvList.has_value()) + { + // + // Trim any whitespace from the end of the list. + // + + while (!EnvList->empty() && isspace(EnvList->back())) + { + EnvList->pop_back(); + } + + for (char *Sp, *EnvName = strtok_r(EnvList->data(), ":", &Sp); EnvName != nullptr; EnvName = strtok_r(NULL, ":", &Sp)) + { + char Mode = 0; + bool SkipTranslation = false; + char* Slash = strchr(EnvName, '/'); + if (Slash != nullptr) + { + *Slash = '\0'; + for (char* Flags = Slash + 1; *Flags != '\0'; Flags++) + { + switch (*Flags) + { + case 'p': // path + case 'l': // path list + if (Mode != 0 && Mode != 'p' && Mode != 'l') + { + SkipTranslation = true; + } + else + { + Mode = *Flags; + } + break; + + case 'u': // Win32 -> WSL translation only + if (Reverse == false) + { + SkipTranslation = true; + } + + break; + + case 'w': // WSL -> Win32 translation only + if (Reverse != false) + { + SkipTranslation = true; + } + + break; + + default: + + // + // Ignore entries with an unknown flag to support future + // extensibility. + // + + SkipTranslation = true; + break; + } + } + } + + auto EnvVal = UtilGetEnv(EnvName, NtEnvironment); + if (!SkipTranslation && EnvVal.has_value()) + { + switch (Mode) + { + case 'p': + case 'l': + { + + // + // Translate the path or path list. + // + + auto Result = UtilTranslatePathList(EnvVal->data(), Reverse); + if (Result.has_value()) + { + EnvVal = std::move(Result.value()); + } + else + { + SkipTranslation = true; + } + + break; + } + + default: + break; + } + } + + if (!SkipTranslation) + { + Append(std::format("{}={}", EnvName, EnvVal.value_or(""))); + Output.push_back('\0'); + } + } + } + + Output.push_back('\0'); + + return Output; +} + +int UtilProcessChildExitCode(int Status, const char* Name, int ExpectedStatus, bool PrintError) + +/*++ + +Routine Description: + + Handles the exit status of a child process. + +Arguments: + + Status - Supplies the exit status. + + Name - Supplies the process image name, for logging. + + ExpectedStatus - Supplies the expected exit status. + + PrintError - Supplies a boolean that specifies if an error should be printed if the process does not return the expected status. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + if (WIFEXITED(Status)) + { + Status = WEXITSTATUS(Status); + if (Status == ExpectedStatus) + { + return 0; + } + } + else if (WIFSIGNALED(Status)) + { + LOG_ERROR("{} killed by signal {}", Name, WTERMSIG(Status)); + return -1; + } + + if (PrintError) + { + LOG_ERROR("{} returned {}", Name, Status); + } + + return -1; +} + +ssize_t UtilRead(int Fd, void* Buffer, size_t BufferSize, int Timeout) + +/*++ + +Routine Description: + + This routine reads a message from the given file descriptor with an optional + timeout. + +Arguments: + + Fd - Supplies a file descriptor. + + Buffer - Supplies a buffer. + + BufferSize - Supplies the size of the buffer in bytes. + + Timeout - Supplies a timeout in milliseconds. + +Return Value: + + The number of bytes read, -1 on failure. + +--*/ + +{ + // + // If a timeout was specified, use a pollfd. + // + + ssize_t Result = 0; + if (Timeout != -1) + { + pollfd PollDescriptor{Fd, POLLIN, 0}; + Result = poll(&PollDescriptor, 1, Timeout); + if ((Result <= 0) || ((PollDescriptor.revents & POLLIN) == 0)) + { + errno = ETIMEDOUT; + Result = -1; + } + } + + if (Result != -1) + { + Result = TEMP_FAILURE_RETRY(read(Fd, Buffer, BufferSize)); + } + + return Result; +} + +ssize_t UtilReadBuffer(int Fd, std::vector& Buffer, int Timeout) + +/*++ + +Routine Description: + + This routine reads a message from the given file descriptor and + automatically grows the buffer when needed. + +Arguments: + + Fd - Supplies a file descriptor. + + Buffer - Supplies a buffer; this buffer will be resized if needed. + + Timeout - Supplies a timeout in milliseconds. + +Return Value: + + The number of bytes read, -1 on failure. + +--*/ + +try +{ + ssize_t Result; + + UtilInitializeMessageBuffer(Buffer); + for (;;) + { + Result = UtilRead(Fd, Buffer.data(), Buffer.size(), Timeout); + if (Result < 0) + { + // + // When the message buffer is too small, EOVERFLOW is returned and + // the buffer size is doubled. + // + + if (errno == EOVERFLOW) + { + Buffer.resize(Buffer.size() * 2); + continue; + } + } + + break; + } + + return Result; +} +CATCH_RETURN_ERRNO() + +std::string UtilReadFile(FILE* File) + +/*++ + +Routine Description: + + This routine reads an entire file into a buffer. + +Arguments: + + File - Supplies a open file stream. + +Return Value: + + A std::string that contains the contents of the file. + +--*/ + +{ + char* Line = nullptr; + size_t LineLength; + std::string output; + + // + // Ensure the file is at the beginning of the stream. + // + + rewind(File); + + // + // Read the entire file into a buffer. + // + + LineLength = 0; + while (getline(&Line, &LineLength, File) != -1) + { + output += Line; + output += '\n'; + } + + return output; +} + +std::vector UtilReadFileRaw(const char* Path, size_t MaxSize) +{ + wil::unique_fd file{open(Path, O_RDONLY)}; + THROW_LAST_ERROR_IF(!file); + + constexpr auto bufferSize = 4096; + + size_t offset = 0; + std::vector buffer; + while (true) + { + buffer.resize(offset + bufferSize); + + int result = read(file.get(), buffer.data() + offset, bufferSize); + THROW_LAST_ERROR_IF(result < 0); + + if (result == 0) + { + break; + } + + offset += result; + + if (offset > MaxSize) + { + LOG_ERROR("File \"{}\" is too big. Maximum size: {}", Path, MaxSize); + THROW_ERRNO(E2BIG); + } + } + + buffer.resize(offset); + + return buffer; +} + +std::pair, std::optional> UtilReadFlavorAndVersion(const char* Path) +try +{ + // See reference format: https://www.freedesktop.org/software/systemd/man/latest/os-release.html + std::ifstream file; + file.open(Path); + + std::optional version; + std::optional flavor; + std::regex versionPattern("^VERSION_ID=\"?([a-zA-Z0-9\\-_\\.,]*)\"?$"); + std::regex flavorPattern("^ID=\"?([a-zA-Z0-9\\-_\\.,]*)\"?$"); + + std::string line; + while (file && std::getline(file, line) && (!version.has_value() || !flavor.has_value())) + { + std::smatch match; + if (std::regex_search(line, match, versionPattern)) + { + version = match.str(1); + } + else if (std::regex_search(line, match, flavorPattern)) + { + flavor = match.str(1); + } + } + + return std::make_pair(std::move(flavor), std::move(version)); +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + + return {}; +} + +ssize_t UtilReadMessageLxBus(int MessageFd, std::vector& Buffer, bool ShutdownOnDisconnect) + +/*++ + +Routine Description: + + This routine reads a message from the server. + +Arguments: + + MessageFd - Supplies a message port file descriptor. + + Buffer - Supplies a buffer; this buffer will be resized if needed. + + ShutdownOnDisconnect - Supplies true for shutdown on disconnect, false for + _exit. + +Return Value: + + The number of bytes read, -1 on failure. + +--*/ + +try +{ + UtilInitializeMessageBuffer(Buffer); + wil::unique_fd Epoll{epoll_create1(EPOLL_CLOEXEC)}; + if (!Epoll) + { + FATAL_ERROR("Failed to create epoll {}", errno); + } + + epoll_event EpollEvent{}; + EpollEvent.events = EPOLLIN | EPOLLHUP; + EpollEvent.data.fd = MessageFd; + if (epoll_ctl(Epoll.get(), EPOLL_CTL_ADD, MessageFd, &EpollEvent) < 0) + { + FATAL_ERROR("Failed epoll_ctl {}", errno); + } + + ssize_t BytesRead; + for (;;) + { + // + // Message port read/write operations are blocking, so use an epoll to + // allow any incoming signals to be processed while waiting for the + // message. + // + + if (TEMP_FAILURE_RETRY(epoll_wait(Epoll.get(), &EpollEvent, 1, -1)) != 1) + { + FATAL_ERROR("Failed epoll_wait {}", errno); + } + + BytesRead = TEMP_FAILURE_RETRY(read(MessageFd, Buffer.data(), Buffer.size())); + if (BytesRead >= static_cast(sizeof(MESSAGE_HEADER))) + { + break; + } + + if (BytesRead < 0) + { + // + // When the message buffer is too small, EOVERFLOW is returned and + // the buffer is increased. When the Windows server disconnects + // EPIPE is returned and the process handles the disconnect. + // + + if (errno == EOVERFLOW) + { + auto BufferSizeNew = *(reinterpret_cast(Buffer.data())); + if (BufferSizeNew <= Buffer.size()) + { + BufferSizeNew = Buffer.size() * 2; + } + + Buffer.resize(BufferSizeNew); + continue; + } + + if (errno == EPIPE) + { + if (ShutdownOnDisconnect == false) + { + _exit(0); + } + } + + FATAL_ERROR("Failed to read message {}", errno); + break; + } + + FATAL_ERROR("Unexpected message size {}", BytesRead); + break; + } + + return BytesRead; +} +CATCH_RETURN_ERRNO() + +int UtilRestoreBlockedSignals() +{ + return sigprocmask(SIG_SETMASK, &g_originalSignals, nullptr); +} + +int UtilSaveBlockedSignals(const sigset_t& SignalMask) +{ + return sigprocmask(SIG_BLOCK, &SignalMask, &g_originalSignals); +} + +// Returns true for signals that should not be saved/restored: +// SIGKILL/SIGSTOP — not settable per POSIX. +// SIGCONT — left at default to allow process resumption. +// SIGHUP — handled separately by the caller. +// 32-34 — internal NPTL signals (__SIGRTMIN through __SIGRTMIN+2) reserved +// by glibc for thread cancellation and other runtime use. +static bool SkipSignal(unsigned int Signal) +{ + switch (Signal) + { + case SIGKILL: + case SIGSTOP: + case SIGCONT: + case SIGHUP: + case 32: + case 33: + case 34: + return true; + + default: + return false; + } +} + +int UtilSaveSignalHandlers(struct sigaction* SavedSignalActions) + +/*++ + +Routine Description: + + This routine saves all settable signal handlers, skipping signals + listed in SkipSignal() (non-settable, SIGHUP, and internal NPTL signals). + +Arguments: + + SavedSignalActions - Supplies an array to save default signal actions. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + for (unsigned int Index = 1; Index < _NSIG; Index += 1) + { + if (SkipSignal(Index)) + { + continue; + } + + if (sigaction(Index, NULL, &SavedSignalActions[Index]) < 0) + { + FATAL_ERROR("sigaction ({}) query failed {}", Index, errno); + } + } + + return 0; +} + +int UtilSetSignalHandlers(struct sigaction* SavedSignalActions, bool Ignore) + +/*++ + +Routine Description: + + This routine sets all settable signal handlers to the given handler, + skipping signals listed in SkipSignal(). + +Arguments: + + SavedSignalActions - Supplies an array of signal handlers to set. + + Ignore - Supplies a boolean specifying if signals should be ignored. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + struct sigaction SignalAction; + + for (unsigned int Index = 1; Index < _NSIG; Index += 1) + { + if (SkipSignal(Index)) + { + continue; + } + + memcpy(&SignalAction, &SavedSignalActions[Index], sizeof(SignalAction)); + if (Ignore != false) + { + SignalAction.sa_handler = SIG_IGN; + } + + if (sigaction(Index, &SignalAction, NULL) < 0) + { + FATAL_ERROR("sigaction ({}) set failed {}", Index, errno); + } + } + + return 0; +} + +void UtilSetThreadName(const char* Name) +{ + g_threadName = Name; + + if (prctl(PR_SET_NAME, reinterpret_cast(Name), 0, 0, 0) < 0) + { + LOG_ERROR("prctl failed {}", errno); + } +} + +void UtilSocketShutdown(int Fd, int How) + +/*++ + +Routine Description: + + This routine cleanly shuts down a socket. + +Arguments: + + Fd - Supplies a socket file descriptor. + + How - Supplies the type of shutdown. + +Return Value: + + None. + +--*/ + +{ + if (shutdown(Fd, How) < 0) + { + LOG_ERROR("shutdown({}) failed {}", How, errno); + } +} + +bool UtilSizeTAdd(size_t Left, size_t Right, size_t* Out) + +/*++ + +Routine Description: + + This routine checks if overflow will occur when adding two size_t values and + if overflow is not possible, adds the values. + +Arguments: + + Left - Supplies the first value. + + Right - Supplies the second value. + + Out - Supplies a pointer to receive sum on success. + +Return Value: + + true if addition was done without overflow, false otherwise. + +--*/ + +{ + bool Success = false; + if (Right > (Right + Left)) + { + goto SizeTAddExit; + } + + *Out = Left + Right; + Success = true; + +SizeTAddExit: + return Success; +} + +std::string_view UtilStringNextToken(std::string_view& View, std::string_view Separators) + +/*++ + +Routine Description: + + This routine extracts the next token identified by one of the specified + separators. + +Arguments: + + View - Supplies the string view to tokenize. On return, this parameter is + modified to contain the remaining portion of the string after the + separator, or an empty string if no separator was found. + + Separators - Supplies the separators that appear between the tokens. + +Return Value: + + The contents of the string up to the next separator, or the entire string + if no separator was found. + +--*/ + +{ + std::string_view Result; + auto Pos = View.find_first_of(Separators); + if (Pos == std::string_view::npos) + { + Result = View; + View = {}; + } + else + { + Result = View.substr(0, Pos); + View = View.substr(Pos + 1); + } + + return Result; +} + +std::string_view UtilStringNextToken(std::string_view& View, char Separator) + +/*++ + +Routine Description: + + This routine extracts the next token identified by the specified separator. + +Arguments: + + View - Supplies the string view to tokenize. On return, this parameter is + modified to contain the remaining portion of the string after the + separator, or an empty string if no separator was found. + + Separators - Supplies the separator that appears between the tokens. + +Return Value: + + The contents of the string up to the next separator, or the entire string + if no separator was found. + +--*/ + +{ + std::string_view Result; + auto Pos = View.find_first_of(Separator); + if (Pos == std::string_view::npos) + { + Result = View; + View = {}; + } + else + { + Result = View.substr(0, Pos); + View = View.substr(Pos + 1); + } + + return Result; +} + +std::optional UtilTranslatePathList(char* PathList, bool IsNtPathList) + +/*++ + +Routine Description: + + This routine translates a semicolon-separated list of NT paths into a + colon-separated list of Linux paths. + +Arguments: + + PathList - Supplies the semicolon-separated list of paths to translate. + + IsNtPathList - Supplies a boolean specifying if the list contains NT-style + paths. + + LxPath - Supplies a buffer to receive an allocated string with the + translated path list. + +Return Value: + + 0 on success, -error for failure. + +--*/ + +{ + char Mode; + const char* SourceSeparator; + char TargetSeparator; + std::string TranslatedList; + + if (IsNtPathList != false) + { + Mode = TRANSLATE_MODE_UNIX; + SourceSeparator = ";"; + TargetSeparator = ':'; + } + else + { + Mode = TRANSLATE_MODE_WINDOWS; + SourceSeparator = ":"; + TargetSeparator = ';'; + } + + // + // Translate each element in the list. If an element in the list fails to + // translate, ignore it. + // + + for (char *Sp, *Path = strtok_r(PathList, SourceSeparator, &Sp); Path != nullptr; Path = strtok_r(NULL, SourceSeparator, &Sp)) + { + // + // Skip relative Windows paths. + // + + if ((Mode == TRANSLATE_MODE_UNIX) && (!UtilIsAbsoluteWindowsPath(Path))) + { + continue; + } + + std::string TranslatedPath = WslPathTranslate(Path, 0, Mode); + if (TranslatedPath.empty()) + { + auto WarningMessage = wsl::shared::Localization::MessageFailedToTranslate(Path); + if (wil::ScopedWarningsCollector::CanCollectWarning()) + { + EMIT_USER_WARNING(std::move(WarningMessage)); + } + else + { + LOG_WARNING("{}", WarningMessage); + } + + continue; + } + + if (!TranslatedList.empty()) + { + TranslatedList += TargetSeparator; + } + + TranslatedList += TranslatedPath; + } + + if (TranslatedList.empty()) + { + return {}; + } + + return TranslatedList; +} + +std::string UtilWinPathTranslate(const char* Path, bool Reverse) + +/*++ + +Routine Description: + + This routine translates an absolute Linux path or an absolute + Windows path into the other. + +Arguments: + + Path - Supplies the path to translate. + + Reverse - Supplies a bool, if set perform translation from Windows->Linux + path; otherwise, translation from Linux->Windows path. + +Return Value: + + 0 on success, -error for general failure. + +--*/ + +try +{ + size_t PathLength; + size_t PrefixLength; + char* Suffix; + size_t SuffixLength; + size_t TranslatedLength; + size_t TranslatedSuffixLength; + + // + // Find if there is a DrvFs or Plan 9 mount for the specified path. + // + + auto PrefixReplacement = UtilFindMount(MOUNT_INFO_FILE, Path, Reverse, &PrefixLength); + if (PrefixReplacement.empty()) + { + // + // The path is not part of a DrvFs or Plan 9 mount, so the path should + // be translated to use the plan9 redirector. + // + + return UtilWinPathTranslateInternal(Path, Reverse); + } + + // + // If translating Linux to Windows, see if any characters need to be + // escaped. + // + + PathLength = strlen(Path); + SuffixLength = PathLength - PrefixLength; + if (Reverse == false) + { + // + // If the suffix is empty and the replacement prefix is just a drive + // letter, make space to append a backslash. + // + + if ((SuffixLength == 0) && (PrefixReplacement.length() == 2) && (PrefixReplacement[1] == DRIVE_SEP_NT)) + { + TranslatedSuffixLength = 1; + } + else + { + TranslatedSuffixLength = EscapePathForNtLength(&Path[PrefixLength]); + } + } + else + { + TranslatedSuffixLength = SuffixLength; + } + + // + // Construct the new path out of the replacement prefix and the remainder + // of the path, escaping it if necessary. + // + + std::string TranslatedPath{PrefixReplacement}; + TranslatedLength = PrefixReplacement.length() + TranslatedSuffixLength; + TranslatedPath.resize(TranslatedLength, '\0'); + Suffix = &TranslatedPath[PrefixReplacement.length()]; + if (TranslatedSuffixLength != SuffixLength) + { + // + // If the suffix is empty and the replacement prefix is just a drive + // letter, append a backslash. Otherwise, escape and append the + // suffix. + // + + if ((SuffixLength == 0) && (TranslatedSuffixLength == 1)) + { + *Suffix = PATH_SEP_NT; + } + else + { + EscapePathForNt(&Path[PrefixLength], Suffix); + } + } + else + { + memcpy(Suffix, &Path[PrefixLength], SuffixLength); + + // + // Make sure the translated path uses the correct separators. + // + // N.B. This is done by the escape method if escaping is necessary. + // + + UtilCanonicalisePathSeparator(Suffix, Reverse ? PATH_SEP : PATH_SEP_NT); + } + + return TranslatedPath; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +std::string UtilWinPathTranslateInternal(const char* Path, bool Reverse) + +/*++ + +Routine Description: + + This routine translates an absolute Linux path or an absolute + Windows path into the other. + +Arguments: + + Path - Supplies the path to translate. + + Reverse - Supplies a bool, if set perform translation from Windows->Linux + path; otherwise, translation from Linux->Windows path. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +try +{ + // + // Get the distribution name from the environment variable. + // + + const auto DistributionName = UtilGetEnvironmentVariable(WSL_DISTRO_NAME_ENV); + if (DistributionName.empty()) + { + return {}; + } + + // + // Construct a prefix (\\wsl.localhost\DistributionName). + // + + std::string Prefix{PLAN9_RDR_PREFIX}; + Prefix += DistributionName; + + // + // For Windows to Linux translation, concatenate the prefix and the escaped + // version of the path. For Linux to Windows translation, ensure the path + // begins with the prefix, remove the prefix, and unescape the path. + // + + std::string TranslatedPath{}; + if (Reverse == false) + { + TranslatedPath += Prefix; + const size_t EscapedPathLength = EscapePathForNtLength(Path); + std::string EscapedPath(EscapedPathLength, '\0'); + EscapePathForNt(Path, EscapedPath.data()); + TranslatedPath += EscapedPath; + } + else + { + auto matchesPrefix = [Path](const std::string_view& prefix) { + if (!wsl::shared::string::StartsWith(Path, prefix, true)) + { + return false; + } + + // Validate that the next character is a path separator or the end of the string to prevent matching other distribution paths like: + // \\wsl.localhost\- + + auto nextChar = Path[prefix.size()]; + return nextChar == '\0' || nextChar == PATH_SEP || nextChar == PATH_SEP_NT; + }; + + auto PrefixLength = Prefix.length(); + if (!matchesPrefix(Prefix)) + { + // + // Check the old \\wsl$ prefix if it's not \\wsl.localhost. + // + + std::string CompatPrefix{PLAN9_RDR_COMPAT_PREFIX}; + CompatPrefix += DistributionName; + if (!matchesPrefix(CompatPrefix)) + { + return {}; + } + + PrefixLength = CompatPrefix.length(); + } + + Path += PrefixLength; + if (strlen(Path) == 0) + { + TranslatedPath += PATH_SEP; + } + else + { + TranslatedPath += Path; + + // + // Canonicalize the path separators and unescape the string. + // + + UtilCanonicalisePathSeparator(TranslatedPath.data(), PATH_SEP); + UnescapePathInplace(TranslatedPath.data()); + } + } + + return TranslatedPath; +} +catch (...) +{ + LOG_CAUGHT_EXCEPTION(); + return {}; +} + +ssize_t UtilWriteBuffer(int Fd, gsl::span Buffer) + +/*++ + +Routine Description: + + This routine writes an entire buffer to the given file descriptor. + +Arguments: + + Fd - Supplies a file descriptor. + + Buffer - Supplies the buffer to write. + +Return Value: + + The total number of bytes written, -1 on failure. + +--*/ + +{ + return UtilWriteBuffer(Fd, Buffer.data(), Buffer.size()); +} + +ssize_t UtilWriteBuffer(int Fd, const void* Buffer, size_t BufferSize) + +/*++ + +Routine Description: + + This routine writes an entire buffer to the given file descriptor. + +Arguments: + + Fd - Supplies a file descriptor. + + Buffer - Supplies a buffer pointer. + + BufferSize - Supplies the buffer size. + +Return Value: + + The total number of bytes written, -1 on failure. + +--*/ + +{ + ssize_t BytesWritten; + ssize_t Result; + ssize_t TotalBytesWritten; + auto* Offset = static_cast(Buffer); + + Result = -1; + TotalBytesWritten = 0; + do + { + BytesWritten = TEMP_FAILURE_RETRY(write(Fd, Offset, BufferSize)); + if (BytesWritten < 0) + { + goto WriteBufferExit; + } + + BufferSize -= BytesWritten; + Offset += BytesWritten; + TotalBytesWritten += BytesWritten; + } while (BufferSize > 0); + + Result = TotalBytesWritten; + +WriteBufferExit: + return Result; +} + +ssize_t UtilWriteStringView(int Fd, std::string_view StringView) + +/*++ + +Routine Description: + + This routine writes a string view to the given file descriptor. + +Arguments: + + Fd - Supplies a file descriptor. + + StringView - Supplies the string view to write. + +Return Value: + + The total number of bytes written, -1 on failure. + +--*/ + +{ + return UtilWriteBuffer(Fd, StringView.data(), StringView.size()); +} + +std::wstring UtilReadFileContentW(std::string_view path) +{ + std::wifstream file; + file.exceptions(std::ifstream::failbit | std::ifstream::badbit); + file.open(path); + + return {std::istreambuf_iterator(file), {}}; +} + +std::string UtilReadFileContent(std::string_view path) +{ + std::ifstream file; + file.exceptions(std::ifstream::failbit | std::ifstream::badbit); + file.open(path); + + return {std::istreambuf_iterator(file), {}}; +} + +HvPciSwiotlbPool UtilReadHvPciSwiotlbPool() +{ + HvPciSwiotlbPool pool{}; + try + { + pool.Base = std::stoull(UtilReadFileContent("/sys/bus/vmbus/drivers/hv_pci/swiotlb_base"), nullptr, 0); + pool.Size = std::stoull(UtilReadFileContent("/sys/bus/vmbus/drivers/hv_pci/swiotlb_size"), nullptr, 0); + } + catch (...) + { + pool = {}; + } + + return pool; +} + +uint16_t UtilWinAfToLinuxAf(uint16_t WinAddressFamily) +{ + uint16_t LinuxAddressFamily = AF_UNSPEC; + + switch (WinAddressFamily) + { + case 2: + LinuxAddressFamily = AF_INET; + break; + case 23: + LinuxAddressFamily = AF_INET6; + break; + } + + return LinuxAddressFamily; +} + +int WriteToFile(const char* Path, const char* Content, int OpenFlags, int Permissions) + +/*++ + +Routine Description: + + Write content to the specified file. + +Arguments: + + Path - Supplies the path to the file to write. + + Content - Supplies the content to be written to the file. + + OpenFlags - Supplies the flags passed to open(). + + Permissions - Supplies the file mode used when O_CREAT causes the file to be created. + +Return Value: + + 0 on success, -1 on failure. + +--*/ + +{ + wil::unique_fd Fd{open(Path, OpenFlags, Permissions)}; + if (!Fd) + { + int errnoPrev = errno; + LOG_ERROR("open({}) failed {}", Path, errno); + errno = errnoPrev; + return -1; + } + + std::string_view Buffer{Content}; + auto Result = UtilWriteStringView(Fd.get(), Buffer); + if (Result != Buffer.size()) + { + int errnoPrev = errno; + LOG_ERROR("write({}, {}) failed {} {}", Path, Content, Result, errno); + errno = errnoPrev; + return -1; + } + + return 0; +} + +int ProcessCreateProcessMessage(wsl::shared::Transaction& Transaction, gsl::span Buffer) +{ + auto* Message = gslhelpers::try_get_struct(Buffer); + if (!Message) + { + LOG_ERROR("Unexpected message size {}", Buffer.size()); + return -1; + } + + auto sendResult = [&](unsigned long Result) { Transaction.SendResultMessage(Result); }; + + sockaddr_vm SocketAddress{}; + wil::unique_fd ListenSocket{UtilListenVsockAnyPort(&SocketAddress, 1, false)}; + THROW_LAST_ERROR_IF(!ListenSocket); + + sendResult(SocketAddress.svm_port); + + // Always return the execution result, since the service expects it + int execResult = -1; + auto sendExecResult = wil::scope_exit([&]() { sendResult(execResult); }); + + const char* Path = wsl::shared::string::FromSpan(Buffer, Message->PathIndex); + const char* Arguments = wsl::shared::string::FromSpan(Buffer, Message->CommandLineIndex); + + // Note: this makes the assumption that no empty arguments are in the message + std::vector ArgumentArray; + while (*Arguments != '\0') + { + ArgumentArray.emplace_back(Arguments); + Arguments += strlen(Arguments) + 1; + } + ArgumentArray.emplace_back(nullptr); + + auto ControlPipe = wil::unique_pipe::create(O_CLOEXEC); + + const int ChildPid = UtilCreateChildProcess("CreateChildProcess", [&]() { + try + { + wil::unique_fd ProcessSocket{UtilAcceptVsock(ListenSocket.get(), SocketAddress, SESSION_LEADER_ACCEPT_TIMEOUT_MS)}; + THROW_LAST_ERROR_IF(!ProcessSocket); + + THROW_LAST_ERROR_IF(dup2(ProcessSocket.get(), STDIN_FILENO) < 0); + THROW_LAST_ERROR_IF(dup2(ProcessSocket.get(), STDOUT_FILENO) < 0); + execv(Path, (char* const*)(ArgumentArray.data())); + + // If this point is reached, an error needs to be reported back since execv() failed. + THROW_LAST_ERROR(); + } + catch (...) + { + auto error = wil::ResultFromCaughtException(); + LOG_ERROR("Command execution failed: {}", errno); + + if (write(ControlPipe.write().get(), &error, sizeof(error)) != sizeof(error)) + { + LOG_ERROR("Failed to write command execution status: {}", errno); + } + } + }); + + THROW_LAST_ERROR_IF(ChildPid < 0); + ControlPipe.write().reset(); + + int ReadResult = TEMP_FAILURE_RETRY(read(ControlPipe.read().get(), &execResult, sizeof(execResult))); + THROW_LAST_ERROR_IF(ReadResult < 0); + + // If the pipe closed without data, then exec() was successful + if (ReadResult == 0) + { + execResult = 0; + } + else if (execResult == sizeof(execResult)) + { + // Otherwise, return the error code to the service + execResult = abs(execResult); + } + else + { + execResult = EINVAL; + } + + return 0; +} + +#define RECLAIM_PATH CGROUP_MOUNTPOINT "/memory.reclaim" + +static bool ReadProcFile(const char* Path, char* Buffer, size_t Size, bool LogErrors = true) + +/*++ + +Routine Description: + + This routine reads the full contents of a procfs file into a caller-supplied + NUL-terminated buffer. Because procfs content is generated on read and a + single read() may return only a partial snapshot, this loops until EOF (or + the buffer fills), which keeps later-appearing fields (for example the + workingset counters deep in /proc/vmstat) from being truncated away. + +Arguments: + + Path - Supplies the procfs path to read. + + Buffer - Supplies the buffer to fill; always NUL-terminated on success. + + Size - Supplies the size of Buffer in bytes (must be at least 1). + + LogErrors - Supplies whether open/read failures are logged. Callers for + which absence is normal (for example PSI) pass false. + +Return Value: + + true on success (Buffer holds the content), false on failure. + +--*/ + +{ + if (Size == 0) + { + return false; + } + + wil::unique_fd Fd{open(Path, O_RDONLY | O_CLOEXEC)}; + if (!Fd) + { + if (LogErrors) + { + LOG_ERROR("open({}) failed {}", Path, errno); + } + + return false; + } + + size_t Total = 0; + while (Total < (Size - 1)) + { + const ssize_t Result = TEMP_FAILURE_RETRY(read(Fd.get(), Buffer + Total, (Size - 1) - Total)); + if (Result < 0) + { + if (LogErrors) + { + LOG_ERROR("read({}) failed {}", Path, errno); + } + + return false; + } + + if (Result == 0) + { + break; + } + + Total += static_cast(Result); + } + + Buffer[Total] = '\0'; + return Total > 0; +} + +static bool ReadAggregateCpuTimes(unsigned long long& Busy, unsigned long long& Idle) + +/*++ + +Routine Description: + + This routine parses the aggregate "cpu" line of /proc/stat and splits the + cumulative jiffies into busy and idle buckets. + + Unlike a naive "user time only" measurement, idle time is taken as idle + + iowait and everything else (user, nice, system, irq, softirq, steal) counts + as busy. This ensures kernel-bound work -- background daemons, I/O, niced + builds -- correctly keeps the VM out of the idle state. + +Arguments: + + Busy - Receives the cumulative busy jiffies across all cores. + + Idle - Receives the cumulative idle jiffies (idle + iowait) across all cores. + +Return Value: + + true on success, false on failure. + +--*/ + +{ + // + // The aggregate cpu line easily fits in this buffer. + // + + char Buffer[256]; + if (!ReadProcFile("/proc/stat", Buffer, sizeof(Buffer))) + { + return false; + } + + // + // Format: "cpu user nice system idle iowait irq softirq steal guest guest_nice". + // The guest fields are already accounted for in user/nice and are ignored here. + // + + unsigned long long Fields[8] = {}; + const int Parsed = sscanf( + Buffer, "cpu %llu %llu %llu %llu %llu %llu %llu %llu", &Fields[0], &Fields[1], &Fields[2], &Fields[3], &Fields[4], &Fields[5], &Fields[6], &Fields[7]); + + if (Parsed < 5) + { + LOG_ERROR("failed to parse /proc/stat cpu line (parsed {})", Parsed); + return false; + } + + Idle = Fields[3] + Fields[4]; + Busy = 0; + for (int Index = 0; Index < Parsed; Index += 1) + { + if (Index != 3 && Index != 4) + { + Busy += Fields[Index]; + } + } + + return true; +} + +static long long GetReclaimableCacheBytes() + +/*++ + +Routine Description: + + This routine returns the amount of reclaimable file-backed page cache (in + bytes) by parsing /proc/meminfo. + + It deliberately counts only memory that cache reclaim can actually return to + the host: file-backed page cache (Active(file) + Inactive(file)) plus + reclaimable slab (SReclaimable). Anonymous memory is excluded because neither + drop_caches nor cgroup reclaim of clean cache can free it, so it must not + drive the reclaim trigger. + +Arguments: + + None. + +Return Value: + + Reclaimable cache in bytes, or -1 on failure. + +--*/ + +{ + char Buffer[4096]; + if (!ReadProcFile("/proc/meminfo", Buffer, sizeof(Buffer))) + { + return -1; + } + + // + // All values in /proc/meminfo are reported in kB. + // + + long long TotalKb = 0; + char* Save = nullptr; + for (char* Line = strtok_r(Buffer, "\n", &Save); Line != nullptr; Line = strtok_r(nullptr, "\n", &Save)) + { + const char* Value = nullptr; + if (strncmp(Line, "Active(file):", 13) == 0) + { + Value = Line + 13; + } + else if (strncmp(Line, "Inactive(file):", 15) == 0) + { + Value = Line + 15; + } + else if (strncmp(Line, "SReclaimable:", 13) == 0) + { + Value = Line + 13; + } + + if (Value != nullptr) + { + TotalKb += strtoll(Value, nullptr, 10); + } + } + + return TotalKb * 1024; +} + +static long long GetFreeMemoryBytes() + +/*++ + +Routine Description: + + This routine returns the amount of free memory (in bytes) currently held in + the buddy allocator, used to decide when there are newly-freed pages worth + compacting -- whether they were freed by reclaim or by a process exiting. + +Arguments: + + None. + +Return Value: + + Free memory in bytes, or -1 on failure. + +--*/ + +{ + struct sysinfo Info = {}; + if (sysinfo(&Info) < 0) + { + LOG_ERROR("sysinfo failed {}", errno); + return -1; + } + + return static_cast(Info.freeram) * Info.mem_unit; +} + +namespace { + +// +// Tunables for the memory reduction thread. +// + +constexpr auto c_pollInterval = std::chrono::seconds(10); + +// An interval is CPU-idle when less than this fraction (per-mille) of aggregate CPU time was spent on +// non-idle work. +constexpr unsigned long long c_busyThresholdPerMille = 5; // 0.5% + +// DropCache: drop only after this many consecutive idle intervals, then re-drop once the reclaimable +// cache grows by at least the re-arm threshold. +constexpr int c_dropCacheIdleIntervals = 30; // 5 minutes +constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; + +// Gradual: reclaimable cache below this floor is always retained; only the excess above it (beyond a +// hysteresis margin) is reclaimed. +constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; +constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; + +// Compaction runs once free memory grows by at least this much since the last compaction. +constexpr long long c_compactFreeGrowthBytes = 256ll * 1024 * 1024; + +// +// Mutable state carried across polling intervals by the reduction thread. +// + +struct MemoryReclaimState +{ + // CPU sampling. + unsigned long long PreviousBusy = 0; + unsigned long long PreviousIdle = 0; + bool HavePreviousSample = false; + + // DropCache. + int IdleStreak = 0; + bool ReclaimedThisIdlePeriod = false; + long long CacheAtLastDrop = 0; + + // Compaction. + long long FreeAtLastCompaction = 0; +}; + +bool RequestCgroupReclaim(long long Bytes) + +/*++ + +Routine Description: + + Best-effort write of a byte count to the cgroup memory.reclaim knob. EAGAIN is an expected outcome + (the kernel freed some, but not all, of the requested pages this pass) and is treated as a successful + reclaim *without* logging, so the long-lived reduction thread does not emit an error every interval. + A transient write failure never throws and tears down the thread. + +Arguments: + + Bytes - Supplies the number of bytes to request the kernel reclaim. + +Return Value: + + true if pages were reclaimed (full success or EAGAIN), false on an unexpected error. + +--*/ + +{ + const std::string Request = std::to_string(Bytes); + + wil::unique_fd Fd{open(RECLAIM_PATH, O_WRONLY | O_CLOEXEC)}; + if (!Fd) + { + LOG_ERROR("open({}) failed {}", RECLAIM_PATH, errno); + return false; + } + + const std::string_view Buffer{Request}; + if (UtilWriteStringView(Fd.get(), Buffer) == static_cast(Buffer.size())) + { + return true; + } + + // + // EAGAIN means the kernel could not evict the full amount this pass (pages were still freed), which + // is normal under reclaim, so it is not logged. + // + + if (errno == EAGAIN) + { + return true; + } + + LOG_ERROR("write({}, {}) failed {}", RECLAIM_PATH, Request, errno); + return false; +} + +bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) + +/*++ + +Routine Description: + + Runs one interval of gentle reclaim (cold-first via cgroup memory.reclaim). Reclaim is gated on CPU + idle and drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it + does not churn near the floor. + +Return Value: + + true if memory was reclaimed this interval, false otherwise. + +--*/ + +{ + (void)State; + + if (!IntervalIdle) + { + return false; + } + + const long long Cache = GetReclaimableCacheBytes(); + if (Cache < 0) + { + return false; + } + + const long long Excess = Cache - c_floorBaseBytes; + if (Excess <= c_gradualHysteresisBytes) + { + return false; + } + + // Best-effort: RequestCgroupReclaim never throws and does not log the expected EAGAIN, so a transient + // write error cannot tear down the long-lived reduction thread. + return RequestCgroupReclaim(Excess); +} + +bool RunDropCacheTick(MemoryReclaimState& State, bool IntervalIdle) + +/*++ + +Routine Description: + + Runs one interval of DropCache policy: gated on sustained CPU idle because drop_caches is + indiscriminate (it evicts hot and cold pages alike). Drops once on becoming idle, then re-drops only + after the reclaimable cache grows meaningfully. + +Return Value: + + true if the cache was dropped this interval, false otherwise. + +--*/ + +{ + if (!IntervalIdle) + { + State.IdleStreak = 0; + State.ReclaimedThisIdlePeriod = false; + return false; + } + + if (++State.IdleStreak < c_dropCacheIdleIntervals) + { + return false; + } + + const long long Cache = GetReclaimableCacheBytes(); + if (Cache >= 0 && (!State.ReclaimedThisIdlePeriod || Cache > State.CacheAtLastDrop + c_cacheGrowthRearmBytes)) + { + // Best-effort; WriteToFile logs internally on failure. A failed drop must not tear down the + // long-lived reduction thread. + if (WriteToFile("/proc/sys/vm/drop_caches", "1\n") == 0) + { + const long long After = GetReclaimableCacheBytes(); + State.CacheAtLastDrop = (After < 0) ? 0 : After; + State.ReclaimedThisIdlePeriod = true; + return true; + } + } + + return false; +} + +void RunCompactionTick(MemoryReclaimState& State, bool Reclaimed) + +/*++ + +Routine Description: + + Compacts when there are newly-freed pages worth coalescing -- from our reclaim or from a process + exiting -- so free-page reporting can hand back large blocks. Tracks downward movement of free memory + so a later rise re-triggers compaction. + +--*/ + +{ + const long long Free = GetFreeMemoryBytes(); + bool Compact = Reclaimed; + if (Free >= 0) + { + if (Free > State.FreeAtLastCompaction + c_compactFreeGrowthBytes) + { + Compact = true; + } + + if (Free < State.FreeAtLastCompaction) + { + State.FreeAtLastCompaction = Free; + } + } + + if (Compact) + { + // Best-effort; WriteToFile logs internally on failure. A failed compaction must not tear down the + // long-lived reduction thread; leave FreeAtLastCompaction unchanged so the next tick retries. + if (WriteToFile("/proc/sys/vm/compact_memory", "1\n") == 0 && Free >= 0) + { + State.FreeAtLastCompaction = Free; + } + } +} + +} // namespace + +void StartMemoryReductionThread(LX_MINI_INIT_MEMORY_RECLAIM_MODE Mode) + +/*++ + +Routine Description: + + This routine starts a background thread that reclaims memory and compacts free pages so the maximum + number of pages can be discarded back to the host. + + The policy is: + + 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is gated on sustained CPU idle and + drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it does + not churn near the floor. + + 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, + hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It + drops once on becoming idle, then re-drops only after the cache grows meaningfully. + + 3. Compaction is gated on free-memory *growth*: it runs when there are newly-freed pages worth + coalescing -- whether they were freed by our reclaim or by a process exiting -- and is skipped + on ticks where nothing new was freed. This both avoids the previous "compact every tick" waste + and ensures naturally-freed memory still gets coalesced for efficient page reporting. + + CPU utilization is measured over each interval using all non-idle CPU time (user, system, irq, + softirq, steal) rather than just user time, so kernel-bound work keeps the VM out of the idle state. + +Arguments: + + Mode - Supplies the memory reclaim mode. + +Return Value: + + None. + +--*/ + +try +{ + if (Mode == LxMiniInitMemoryReclaimModeDisabled) + { + return; + } + + std::thread([Mode = Mode]() mutable { + try + { + // + // Run at idle scheduling priority so reclaim never competes with real work. + // + + sched_param Parameter{}; + Parameter.sched_priority = 0; + const int Result = pthread_setschedparam(pthread_self(), SCHED_IDLE, &Parameter); + THROW_ERRNO_IF(Result, Result != 0); + + // + // Gradual mode needs the cgroup memory.reclaim knob; fall back to dropping caches if it is + // not present. + // + + if (Mode == LxMiniInitMemoryReclaimModeGradual && access(RECLAIM_PATH, W_OK) < 0) + { + LOG_WARNING("access({}, W_OK) failed {}, falling back to drop_caches", RECLAIM_PATH, errno); + Mode = LxMiniInitMemoryReclaimModeDropCache; + } + + MemoryReclaimState State; + + for (;;) + { + std::this_thread::sleep_for(c_pollInterval); + + unsigned long long Busy = 0; + unsigned long long Idle = 0; + if (!ReadAggregateCpuTimes(Busy, Idle)) + { + continue; + } + + // + // Two samples are required to compute utilization over the interval. + // + + if (!State.HavePreviousSample) + { + State.PreviousBusy = Busy; + State.PreviousIdle = Idle; + State.HavePreviousSample = true; + continue; + } + + const unsigned long long BusyDelta = Busy - State.PreviousBusy; + const unsigned long long TotalDelta = (Busy + Idle) - (State.PreviousBusy + State.PreviousIdle); + State.PreviousBusy = Busy; + State.PreviousIdle = Idle; + + const bool IntervalIdle = (TotalDelta == 0) || (BusyDelta * 1000 <= TotalDelta * c_busyThresholdPerMille); + + const bool Reclaimed = (Mode == LxMiniInitMemoryReclaimModeGradual) ? RunGradualTick(State, IntervalIdle) + : RunDropCacheTick(State, IntervalIdle); + + RunCompactionTick(State, Reclaimed); + } + } + CATCH_LOG() + }).detach(); +} +CATCH_LOG() From 386378faa6c9fa1ddc2e4927649622f4f9c2d559 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 11:07:33 -0700 Subject: [PATCH 2/3] Drive gradual reclaim by memory pressure instead of CPU idle A CPU-bound workload can sit on gigabytes of cold page cache that a CPU-idle check would never reclaim. Read the PSI "some avg10" memory pressure from /proc/pressure/memory and reclaim cold cache toward the fixed floor whenever pressure is low, even while the VM is busy, backing off once the workload starts stalling on memory. A busy interval reclaims at most a bounded step (c_gradualStepBusyBytes) so a large backlog is drained gently; an idle interval drains the full excess at once. When PSI is unavailable (kernel built without CONFIG_PSI), gradual reclaim falls back to gating on CPU idle. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 98 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 8 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 3e2628c2f..df951a553 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -3737,6 +3737,52 @@ Return Value: return static_cast(Info.freeram) * Info.mem_unit; } +static double GetMemoryPressureAvg10() + +/*++ + +Routine Description: + + This routine returns the PSI "some" 10-second memory pressure average from + /proc/pressure/memory. This is the fraction of time (0-100) that some task + stalled waiting on memory in the last 10 seconds; ~0 means there is slack to + reclaim cold pages without hurting the workload, regardless of CPU activity. + +Arguments: + + None. + +Return Value: + + The "some avg10" pressure value, or -1.0 if PSI is unavailable. + +--*/ + +{ + // + // PSI may be unavailable (kernel built without CONFIG_PSI), which is a normal + // condition the caller handles, so failures are not logged. + // + + char Buffer[256]; + if (!ReadProcFile("/proc/pressure/memory", Buffer, sizeof(Buffer), false)) + { + return -1.0; + } + + // + // The first line is "some avg10= avg60= avg300= total=". + // + + const char* Marker = strstr(Buffer, "avg10="); + if (Marker == nullptr) + { + return -1.0; + } + + return strtod(Marker + (sizeof("avg10=") - 1), nullptr); +} + namespace { // @@ -3759,6 +3805,14 @@ constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; +// Gradual is driven by PSI: reclaim cold cache while the "some avg10" memory pressure is below this +// value (in percent), and back off above it. +constexpr double c_pressureReclaimMax = 1.0; + +// While the VM is busy, reclaim at most this much per interval so a large backlog is drained gently; an +// idle interval drains the full excess at once. +constexpr long long c_gradualStepBusyBytes = 256ll * 1024 * 1024; + // Compaction runs once free memory grows by at least this much since the last compaction. constexpr long long c_compactFreeGrowthBytes = 256ll * 1024 * 1024; @@ -3839,9 +3893,11 @@ bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) Routine Description: - Runs one interval of gentle reclaim (cold-first via cgroup memory.reclaim). Reclaim is gated on CPU - idle and drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it - does not churn near the floor. + Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward a + fixed floor. While the kernel reports little/no memory pressure (PSI) there is cold cache the guest is + not really using, so it is reclaimed -- even while the VM is busy. A busy interval reclaims at most a + bounded step so a large backlog is drained gently; an idle interval drains the full excess. When PSI + is unavailable, reclaim falls back to gating on CPU idle. Return Value: @@ -3852,7 +3908,27 @@ Return Value: { (void)State; - if (!IntervalIdle) + const double Pressure = GetMemoryPressureAvg10(); + + bool MayReclaim; + if (Pressure < 0.0) + { + // + // No PSI brake available: gate reclaim on CPU idle. + // + + MayReclaim = IntervalIdle; + } + else + { + // + // Reclaim only while pressure is low; back off once the workload starts stalling on memory. + // + + MayReclaim = Pressure < c_pressureReclaimMax; + } + + if (!MayReclaim) { return false; } @@ -3869,9 +3945,11 @@ Return Value: return false; } + const long long ToFree = IntervalIdle ? Excess : (std::min)(Excess, c_gradualStepBusyBytes); + // Best-effort: RequestCgroupReclaim never throws and does not log the expected EAGAIN, so a transient // write error cannot tear down the long-lived reduction thread. - return RequestCgroupReclaim(Excess); + return RequestCgroupReclaim(ToFree); } bool RunDropCacheTick(MemoryReclaimState& State, bool IntervalIdle) @@ -3972,9 +4050,13 @@ Routine Description: The policy is: - 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is gated on sustained CPU idle and - drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it does - not churn near the floor. + 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is driven by *memory pressure*, + not CPU idleness. While the kernel reports little/no memory pressure (PSI), there is cold + memory the guest is not really using, so it is reclaimed toward a fixed floor -- even while + the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of cold + cache that a CPU-idle check would never reclaim. A busy interval reclaims at most a bounded + step so a large backlog is drained gently; an idle interval drains the full excess. When PSI + is unavailable, Gradual falls back to reclaiming only while CPU-idle. 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It From 4e60f356b1f8855099b4e394a668675bf7607ff2 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 11:08:39 -0700 Subject: [PATCH 3/3] Adapt the gradual reclaim floor to the working set using refaults Reclaiming toward a single fixed floor either gives back too little on large VMs or evicts pages a larger working set immediately faults back in. Make the floor adaptive and self-regulating: - Track file refaults (/proc/vmstat workingset_refault_file) as a signal that reclaim is cutting into the working set. When refaults spike (or PSI pressure rises into the backoff band), raise the floor to protect what the workload is actually using and stop reclaiming. - After sustained calm, decay the floor back toward the base so a shrunken working set is eventually re-probed downward. - Scale the floor's upper bound to a fraction of guest RAM so large working sets on large VMs can be fully protected, falling back to a fixed cap when total RAM is unavailable. The refault counter is parsed unsigned and clamped so it cannot wrap negative and disable the brake. The PSI-unavailable path keeps the same refault brake and floor decay so behavior degrades gracefully on kernels without CONFIG_PSI. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 275 ++++++++++++++++++++++++++++++++++------ 1 file changed, 234 insertions(+), 41 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index df951a553..67b43e5b3 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -3737,6 +3737,37 @@ Return Value: return static_cast(Info.freeram) * Info.mem_unit; } +static long long GetTotalMemoryBytes() + +/*++ + +Routine Description: + + This routine returns the guest's total RAM (in bytes), used to scale the + adaptive reclaim floor cap relative to the configured VM size rather than a + fixed absolute value. + +Arguments: + + None. + +Return Value: + + Total memory in bytes, or -1 on failure. + +--*/ + +{ + struct sysinfo Info = {}; + if (sysinfo(&Info) < 0) + { + LOG_ERROR("sysinfo failed {}", errno); + return -1; + } + + return static_cast(Info.totalram) * Info.mem_unit; +} + static double GetMemoryPressureAvg10() /*++ @@ -3783,6 +3814,55 @@ Return Value: return strtod(Marker + (sizeof("avg10=") - 1), nullptr); } +static long long GetFileRefaults() + +/*++ + +Routine Description: + + This routine returns the cumulative workingset_refault_file counter from + /proc/vmstat. A rising counter means file pages that were previously evicted + are being faulted back in -- i.e. reclaim (or workload pressure) is cutting + into the live working set, which is used as a brake on further reclaim. + +Arguments: + + None. + +Return Value: + + The cumulative file refault count (in pages), or -1 if unavailable. + +--*/ + +{ + // + // /proc/vmstat is several KB and the counter appears well into the file, so a + // generous buffer (filled across reads by ReadProcFile) avoids truncating it. + // + + char Buffer[16384]; + if (!ReadProcFile("/proc/vmstat", Buffer, sizeof(Buffer))) + { + return -1; + } + + const char* Marker = strstr(Buffer, "workingset_refault_file "); + if (Marker == nullptr) + { + return -1; + } + + // + // The counter is an unsigned cumulative page count; parse it unsigned and clamp to LLONG_MAX so a + // value near the top of the range (or a wrap on a very long-lived system) cannot turn negative and + // disable the refault brake. + // + + const unsigned long long Refaults = strtoull(Marker + (sizeof("workingset_refault_file ") - 1), nullptr, 10); + return static_cast((std::min)(Refaults, static_cast(LLONG_MAX))); +} + namespace { // @@ -3800,17 +3880,31 @@ constexpr unsigned long long c_busyThresholdPerMille = 5; // 0.5% constexpr int c_dropCacheIdleIntervals = 30; // 5 minutes constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; -// Gradual: reclaimable cache below this floor is always retained; only the excess above it (beyond a -// hysteresis margin) is reclaimed. +// Gradual: PSI "some avg10" thresholds. Reclaim only when pressure is below the reclaim ceiling; raise +// the floor and back off when pressure exceeds the backoff floor. The band in between is a dead zone +// that prevents oscillation. +constexpr double c_pressureReclaimMax = 1.0; +constexpr double c_pressureBackoffMin = 5.0; + +// Gradual: back off if more than this many file pages were refaulted in one interval (~40 MB at 4 KB +// pages), indicating reclaim is cutting into the live working set. +constexpr long long c_refaultBackoffPages = 10000; + +// Gradual: adaptive floor of reclaimable cache to retain. It starts small, doubles (up to the cap) +// whenever reclaim hurts, and decays back toward the base only after sustained calm so a learned working +// set is held steady instead of being re-probed (and re-evicted) every tick. The cap is scaled to a +// fraction of guest RAM at startup (see c_floorMaxRam*), so large working sets on large VMs can be fully +// protected; the fallback applies only if total RAM cannot be determined. constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; +constexpr long long c_floorMaxFallbackBytes = 4096ll * 1024 * 1024; +constexpr long long c_floorMaxRamNumerator = 3; +constexpr long long c_floorMaxRamDenominator = 4; +constexpr long long c_floorDecayBytes = 64ll * 1024 * 1024; +constexpr int c_floorDecayAfterCalmTicks = 6; // 60s constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; -// Gradual is driven by PSI: reclaim cold cache while the "some avg10" memory pressure is below this -// value (in percent), and back off above it. -constexpr double c_pressureReclaimMax = 1.0; - -// While the VM is busy, reclaim at most this much per interval so a large backlog is drained gently; an -// idle interval drains the full excess at once. +// Gradual: when the VM is busy, reclaim at most this much per interval (cautious, so a burst of refaults +// can brake it before too much is evicted); when idle, drain the full excess. constexpr long long c_gradualStepBusyBytes = 256ll * 1024 * 1024; // Compaction runs once free memory grows by at least this much since the last compaction. @@ -3832,10 +3926,55 @@ struct MemoryReclaimState bool ReclaimedThisIdlePeriod = false; long long CacheAtLastDrop = 0; + // Gradual. + long long FloorBytes = c_floorBaseBytes; + long long FloorMaxBytes = c_floorMaxFallbackBytes; + long long PreviousRefaults = -1; + int CalmStreak = 0; + // Compaction. long long FreeAtLastCompaction = 0; }; +void RaiseFloorToProtect(MemoryReclaimState& State, long long Cache) + +/*++ + +Routine Description: + + Reclaim is hurting (real pressure, or the workload is refaulting evicted file pages). Rather than + ramping the floor up over many thrashing ticks, this immediately raises it to protect the cache that + is actually in use: it jumps to at least the current reclaimable cache (plus a margin) so reclaim + stops cutting into the working set on this very tick. The slow calm-time decay then re-probes downward + gently, so an over-estimate self-corrects without re-thrashing. + +--*/ + +{ + const long long Protect = (Cache < 0) ? (State.FloorBytes * 2) : (Cache + c_gradualHysteresisBytes); + State.FloorBytes = (std::min)(State.FloorMaxBytes, (std::max)(State.FloorBytes * 2, Protect)); + State.CalmStreak = 0; +} + +void DecayFloorAfterCalm(MemoryReclaimState& State) + +/*++ + +Routine Description: + + Relaxes the floor toward the base, but only after sustained calm, so a learned working set is not + immediately re-probed (and re-evicted) the moment pressure subsides. + +--*/ + +{ + State.CalmStreak += 1; + if (State.CalmStreak >= c_floorDecayAfterCalmTicks && State.FloorBytes > c_floorBaseBytes) + { + State.FloorBytes = (std::max)(c_floorBaseBytes, State.FloorBytes - c_floorDecayBytes); + } +} + bool RequestCgroupReclaim(long long Bytes) /*++ @@ -3893,11 +4032,9 @@ bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) Routine Description: - Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward a - fixed floor. While the kernel reports little/no memory pressure (PSI) there is cold cache the guest is - not really using, so it is reclaimed -- even while the VM is busy. A busy interval reclaims at most a - bounded step so a large backlog is drained gently; an idle interval drains the full excess. When PSI - is unavailable, reclaim falls back to gating on CPU idle. + Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward the + adaptive floor. The refault rate is refreshed every interval so it reflects current activity + regardless of whether reclaim runs. Return Value: @@ -3906,50 +4043,93 @@ Return Value: --*/ { - (void)State; - const double Pressure = GetMemoryPressureAvg10(); + const long long Refaults = GetFileRefaults(); + const long long RefaultDelta = (Refaults >= 0 && State.PreviousRefaults >= 0) ? (Refaults - State.PreviousRefaults) : 0; + if (Refaults >= 0) + { + State.PreviousRefaults = Refaults; + } + + const bool RefaultBrake = RefaultDelta >= c_refaultBackoffPages; + const long long Cache = GetReclaimableCacheBytes(); bool MayReclaim; if (Pressure < 0.0) { // - // No PSI brake available: gate reclaim on CPU idle. + // No PSI brake available: gate reclaim on CPU idle, but still honor the refault brake so reclaim + // backs off (and the floor is raised to protect the working set) if it starts faulting evicted + // file pages back in. On calm idle intervals decay the floor as in the PSI path so a learned + // working set is eventually re-probed downward. + // + + if (RefaultBrake) + { + RaiseFloorToProtect(State, Cache); + MayReclaim = false; + } + else if (IntervalIdle) + { + DecayFloorAfterCalm(State); + MayReclaim = true; + } + else + { + State.CalmStreak = 0; + MayReclaim = false; + } + } + else if (Pressure >= c_pressureBackoffMin || RefaultBrake) + { + RaiseFloorToProtect(State, Cache); + MayReclaim = false; + } + else if (Pressure >= c_pressureReclaimMax) + { + // + // Dead zone: some pressure but not enough to back off. Hold steady. // - MayReclaim = IntervalIdle; + State.CalmStreak = 0; + MayReclaim = false; } else { // - // Reclaim only while pressure is low; back off once the workload starts stalling on memory. + // Calm: relax the floor toward the base after sustained calm, then reclaim cold excess. // - MayReclaim = Pressure < c_pressureReclaimMax; + DecayFloorAfterCalm(State); + MayReclaim = true; } - if (!MayReclaim) + bool Reclaimed = false; + if (MayReclaim && Cache >= 0) { - return false; - } + const long long Excess = Cache - State.FloorBytes; + if (Excess > c_gradualHysteresisBytes) + { + const long long ToFree = IntervalIdle ? Excess : (std::min)(Excess, c_gradualStepBusyBytes); - const long long Cache = GetReclaimableCacheBytes(); - if (Cache < 0) - { - return false; - } + // Best-effort: RequestCgroupReclaim never throws and does not log the expected EAGAIN, so a + // transient write error cannot tear down the long-lived reduction thread. + Reclaimed = RequestCgroupReclaim(ToFree); - const long long Excess = Cache - c_floorBaseBytes; - if (Excess <= c_gradualHysteresisBytes) - { - return false; - } + // + // Reset the refault baseline to post-reclaim so the next interval's delta attributes refaults + // to this reclaim. + // - const long long ToFree = IntervalIdle ? Excess : (std::min)(Excess, c_gradualStepBusyBytes); + const long long After = GetFileRefaults(); + if (After >= 0) + { + State.PreviousRefaults = After; + } + } + } - // Best-effort: RequestCgroupReclaim never throws and does not log the expected EAGAIN, so a transient - // write error cannot tear down the long-lived reduction thread. - return RequestCgroupReclaim(ToFree); + return Reclaimed; } bool RunDropCacheTick(MemoryReclaimState& State, bool IntervalIdle) @@ -4052,11 +4232,12 @@ Routine Description: 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is driven by *memory pressure*, not CPU idleness. While the kernel reports little/no memory pressure (PSI), there is cold - memory the guest is not really using, so it is reclaimed toward a fixed floor -- even while - the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of cold - cache that a CPU-idle check would never reclaim. A busy interval reclaims at most a bounded - step so a large backlog is drained gently; an idle interval drains the full excess. When PSI - is unavailable, Gradual falls back to reclaiming only while CPU-idle. + memory the guest is not really using, so it is reclaimed toward an adaptive floor -- even + while the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of + cold cache that a CPU-idle check would never reclaim. The amount of file refaults is used as a + brake: if reclaim starts faulting evicted file pages back in (or pressure rises), the floor is + raised and reclaim backs off, so the loop self-regulates to the true working-set size. When + PSI is unavailable, Gradual falls back to reclaiming only while CPU-idle. 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It @@ -4112,6 +4293,17 @@ try MemoryReclaimState State; + // + // Scale the adaptive floor cap to a fraction of guest RAM so large working sets on large VMs + // can be fully protected, falling back to a fixed cap only if total RAM is unavailable. + // + + const long long TotalRam = GetTotalMemoryBytes(); + if (TotalRam > 0) + { + State.FloorMaxBytes = (std::max)(c_floorBaseBytes, (TotalRam * c_floorMaxRamNumerator) / c_floorMaxRamDenominator); + } + for (;;) { std::this_thread::sleep_for(c_pollInterval); @@ -4132,6 +4324,7 @@ try State.PreviousBusy = Busy; State.PreviousIdle = Idle; State.HavePreviousSample = true; + State.PreviousRefaults = GetFileRefaults(); continue; }