From 12dd1900a37d6f40cb7c3b93de00437d1bdeb8a9 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 10:17:09 -0700 Subject: [PATCH 1/4] Rework the memory reduction thread around explicit reclaim helpers Replace the ring-buffer idle detector and user-CPU-only sampling in the mini-init memory reduction thread with a clearer, helper-based design: - Sample aggregate non-idle CPU time (user, nice, system, irq, softirq, steal) so kernel-bound work keeps the VM out of the idle state, instead of looking at user time alone. - GetReclaimableCacheBytes parses /proc/meminfo via a small ReadProcFile helper; GetFreeMemoryBytes queries sysinfo(). - Gradual mode reclaims cold page cache (cgroup memory.reclaim) above a fixed floor while CPU-idle, with a hysteresis margin so it does not churn near the floor. - DropCache mode stays gated on sustained CPU idle, drops once, and re-drops only after the reclaimable cache grows meaningfully. - Compaction runs only after a reclaim or once free memory has grown meaningfully, so it runs only when there are newly-freed pages worth coalescing. - Disabled mode now returns before the thread is started, so compaction no longer runs when reclaim is disabled. All procfs and cgroup writes are best-effort and never tear down the long-lived reduction thread on a transient error. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 525 ++++++++++++++++++++++++++++++---------- 1 file changed, 402 insertions(+), 123 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 962e426a2..705f556cf 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -24,6 +24,7 @@ Module Name: #include #include #include +#include #include #include #include @@ -3505,71 +3506,210 @@ int ProcessCreateProcessMessage(wsl::shared::Transaction& Transaction, gsl::span #define RECLAIM_PATH CGROUP_MOUNTPOINT "/memory.reclaim" -static long long int GetUserCpuTime() +static bool ReadProcFile(const char* Path, char* Buffer, size_t Size, bool LogErrors = true) /*++ Routine Description: - This routine parses /proc/stat to query a summary of all user CPU time. 
+ This routine reads the full contents of a procfs file into a caller-supplied + NUL-terminated buffer. Because procfs content is generated on read and a + single read() may return only a partial snapshot, this loops until EOF (or + the buffer fills), which keeps later-appearing fields (for example the + workingset counters deep in /proc/vmstat) from being truncated away. Arguments: - None. + Path - Supplies the procfs path to read. + + Buffer - Supplies the buffer to fill; always NUL-terminated on success. + + Size - Supplies the size of Buffer in bytes (must be at least 1). + + LogErrors - Supplies whether open/read failures are logged. Callers for + which absence is normal (for example PSI) pass false. Return Value: - The current user CPU counter for all cores. + true on success (Buffer holds the content), false on failure. --*/ { - wil::unique_fd Fd{open("/proc/stat", O_RDONLY)}; + wil::unique_fd Fd{open(Path, O_RDONLY)}; if (!Fd) { - LOG_ERROR("open failed {}", errno); - return -1; + if (LogErrors) + { + LOG_ERROR("open({}) failed {}", Path, errno); + } + + return false; } - char Buffer[32]; - int Result = TEMP_FAILURE_RETRY(read(Fd.get(), Buffer, (sizeof(Buffer) - 1))); - if (Result <= 0) + size_t Total = 0; + while (Total < (Size - 1)) { - LOG_ERROR("read failed {}", errno); - return -1; + const int Result = TEMP_FAILURE_RETRY(read(Fd.get(), Buffer + Total, (Size - 1) - Total)); + if (Result < 0) + { + if (LogErrors) + { + LOG_ERROR("read({}) failed {}", Path, errno); + } + + return false; + } + + if (Result == 0) + { + break; + } + + Total += Result; + } + + Buffer[Total] = '\0'; + return Total > 0; +} + +static bool ReadAggregateCpuTimes(unsigned long long& Busy, unsigned long long& Idle) + +/*++ + +Routine Description: + + This routine parses the aggregate "cpu" line of /proc/stat and splits the + cumulative jiffies into busy and idle buckets. 
+ + Unlike a naive "user time only" measurement, idle time is taken as idle + + iowait and everything else (user, nice, system, irq, softirq, steal) counts + as busy. This ensures kernel-bound work -- background daemons, I/O, niced + builds -- correctly keeps the VM out of the idle state. + +Arguments: + + Busy - Receives the cumulative busy jiffies across all cores. + + Idle - Receives the cumulative idle jiffies (idle + iowait) across all cores. + +Return Value: + + true on success, false on failure. + +--*/ + +{ + // + // The aggregate cpu line easily fits in this buffer. + // + + char Buffer[256]; + if (!ReadProcFile("/proc/stat", Buffer, sizeof(Buffer))) + { + return false; } // - // Parse the first line of /proc/stat which is in the format - // "cpu ". + // Format: "cpu user nice system idle iowait irq softirq steal guest guest_nice". + // The guest fields are already accounted for in user/nice and are ignored here. // - Buffer[Result] = '\0'; - char* Sp1; - char* Info = strtok_r(Buffer, " \n", &Sp1); - if (Info == nullptr) + unsigned long long Fields[8] = {}; + const int Parsed = sscanf( + Buffer, "cpu %llu %llu %llu %llu %llu %llu %llu %llu", &Fields[0], &Fields[1], &Fields[2], &Fields[3], &Fields[4], &Fields[5], &Fields[6], &Fields[7]); + + if (Parsed < 5) { - LOG_ERROR("/proc/stat first line missing cpu label"); - return -1; + LOG_ERROR("failed to parse /proc/stat cpu line (parsed {})", Parsed); + return false; + } + + Idle = Fields[3] + Fields[4]; + Busy = 0; + for (int Index = 0; Index < Parsed; Index += 1) + { + if (Index != 3 && Index != 4) + { + Busy += Fields[Index]; + } } - Info = strtok_r(nullptr, " \n", &Sp1); - if (Info == nullptr) + return true; +} + +static long long GetReclaimableCacheBytes() + +/*++ + +Routine Description: + + This routine returns the amount of reclaimable file-backed page cache (in + bytes) by parsing /proc/meminfo. 
+ + It deliberately counts only memory that cache reclaim can actually return to + the host: file-backed page cache (Active(file) + Inactive(file)) plus + reclaimable slab (SReclaimable). Anonymous memory is excluded because neither + drop_caches nor cgroup reclaim of clean cache can free it, so it must not + drive the reclaim trigger. + +Arguments: + + None. + +Return Value: + + Reclaimable cache in bytes, or -1 on failure. + +--*/ + +{ + char Buffer[4096]; + if (!ReadProcFile("/proc/meminfo", Buffer, sizeof(Buffer))) { - LOG_ERROR("/proc/stat first line missing cpu counter"); return -1; } - return strtoll(Info, nullptr, 10); + // + // All values in /proc/meminfo are reported in kB. + // + + long long TotalKb = 0; + char* Save = nullptr; + for (char* Line = strtok_r(Buffer, "\n", &Save); Line != nullptr; Line = strtok_r(nullptr, "\n", &Save)) + { + const char* Value = nullptr; + if (strncmp(Line, "Active(file):", 13) == 0) + { + Value = Line + 13; + } + else if (strncmp(Line, "Inactive(file):", 15) == 0) + { + Value = Line + 15; + } + else if (strncmp(Line, "SReclaimable:", 13) == 0) + { + Value = Line + 13; + } + + if (Value != nullptr) + { + TotalKb += strtoll(Value, nullptr, 10); + } + } + + return TotalKb * 1024; } -static ssize_t GetMemoryInUse() +static long long GetFreeMemoryBytes() /*++ Routine Description: - This routine returns the amount memory in use in bytes. + This routine returns the amount of free memory (in bytes) currently held in + the buddy allocator, used to decide when there are newly-freed pages worth + compacting -- whether they were freed by reclaim or by a process exiting. Arguments: @@ -3577,17 +3717,197 @@ Routine Description: Return Value: - Total memory - Free memory. Includes that used by cache and buffers. + Free memory in bytes, or -1 on failure. 
--*/ -try { struct sysinfo Info = {}; - THROW_LAST_ERROR_IF(sysinfo(&Info) < 0); - return (Info.totalram - Info.freeram) * Info.mem_unit; + if (sysinfo(&Info) < 0) + { + LOG_ERROR("sysinfo failed {}", errno); + return -1; + } + + return static_cast(Info.freeram) * Info.mem_unit; } -CATCH_RETURN_ERRNO() + +namespace { + +// +// Tunables for the memory reduction thread. +// + +constexpr auto c_pollInterval = std::chrono::seconds(10); + +// An interval is CPU-idle when less than this fraction (per-mille) of aggregate CPU time was spent on +// non-idle work. +constexpr unsigned long long c_busyThresholdPerMille = 5; // 0.5% + +// DropCache: drop only after this many consecutive idle intervals, then re-drop once the reclaimable +// cache grows by at least the re-arm threshold. +constexpr int c_dropCacheIdleIntervals = 30; // 5 minutes +constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; + +// Gradual: reclaimable cache below this floor is always retained; only the excess above it (beyond a +// hysteresis margin) is reclaimed. +constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; +constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; + +// Compaction runs once free memory grows by at least this much since the last compaction. +constexpr long long c_compactFreeGrowthBytes = 256ll * 1024 * 1024; + +// +// Mutable state carried across polling intervals by the reduction thread. +// + +struct MemoryReclaimState +{ + // CPU sampling. + unsigned long long PreviousBusy = 0; + unsigned long long PreviousIdle = 0; + bool HavePreviousSample = false; + + // DropCache. + int IdleStreak = 0; + bool ReclaimedThisIdlePeriod = false; + long long CacheAtLastDrop = 0; + + // Compaction. + long long FreeAtLastCompaction = 0; +}; + +bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) + +/*++ + +Routine Description: + + Runs one interval of gentle reclaim (cold-first via cgroup memory.reclaim). 
Reclaim is gated on CPU + idle and drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it + does not churn near the floor. + +Return Value: + + true if memory was reclaimed this interval, false otherwise. + +--*/ + +{ + (void)State; + + if (!IntervalIdle) + { + return false; + } + + const long long Cache = GetReclaimableCacheBytes(); + if (Cache < 0) + { + return false; + } + + const long long Excess = Cache - c_floorBaseBytes; + if (Excess <= c_gradualHysteresisBytes) + { + return false; + } + + const std::string Bytes = std::to_string(Excess); + + // Best-effort: WriteToFile logs internally on failure. EAGAIN merely means the kernel could not evict + // the full amount this pass (pages were still freed), so it counts as reclaim. Never throw on a + // transient write error and tear down the long-lived reduction thread. + const int Status = WriteToFile(RECLAIM_PATH, Bytes.c_str()); + return (Status == 0) || (errno == EAGAIN); +} + +bool RunDropCacheTick(MemoryReclaimState& State, bool IntervalIdle) + +/*++ + +Routine Description: + + Runs one interval of DropCache policy: gated on sustained CPU idle because drop_caches is + indiscriminate (it evicts hot and cold pages alike). Drops once on becoming idle, then re-drops only + after the reclaimable cache grows meaningfully. + +Return Value: + + true if the cache was dropped this interval, false otherwise. + +--*/ + +{ + if (!IntervalIdle) + { + State.IdleStreak = 0; + State.ReclaimedThisIdlePeriod = false; + return false; + } + + if (++State.IdleStreak < c_dropCacheIdleIntervals) + { + return false; + } + + const long long Cache = GetReclaimableCacheBytes(); + if (Cache >= 0 && (!State.ReclaimedThisIdlePeriod || Cache > State.CacheAtLastDrop + c_cacheGrowthRearmBytes)) + { + // Best-effort; WriteToFile logs internally on failure. A failed drop must not tear down the + // long-lived reduction thread. 
+ if (WriteToFile("/proc/sys/vm/drop_caches", "1\n") == 0) + { + const long long After = GetReclaimableCacheBytes(); + State.CacheAtLastDrop = (After < 0) ? 0 : After; + State.ReclaimedThisIdlePeriod = true; + return true; + } + } + + return false; +} + +void RunCompactionTick(MemoryReclaimState& State, bool Reclaimed) + +/*++ + +Routine Description: + + Compacts when there are newly-freed pages worth coalescing -- from our reclaim or from a process + exiting -- so free-page reporting can hand back large blocks. Tracks downward movement of free memory + so a later rise re-triggers compaction. + +--*/ + +{ + const long long Free = GetFreeMemoryBytes(); + bool Compact = Reclaimed; + if (Free >= 0) + { + if (Free > State.FreeAtLastCompaction + c_compactFreeGrowthBytes) + { + Compact = true; + } + + if (Free < State.FreeAtLastCompaction) + { + State.FreeAtLastCompaction = Free; + } + } + + if (Compact) + { + // Best-effort; WriteToFile logs internally on failure. A failed compaction must not tear down the + // long-lived reduction thread; leave FreeAtLastCompaction unchanged so the next tick retries. + if (WriteToFile("/proc/sys/vm/compact_memory", "1\n") == 0 && Free >= 0) + { + State.FreeAtLastCompaction = Free; + } + } +} + +} // namespace void StartMemoryReductionThread(LX_MINI_INIT_MEMORY_RECLAIM_MODE Mode) @@ -3595,8 +3915,26 @@ void StartMemoryReductionThread(LX_MINI_INIT_MEMORY_RECLAIM_MODE Mode) Routine Description: - This routine starts a background thread that performs memory compaction and optional cache/memory - reclaim when the VM is idle. This ensures that the maximum number of pages can be discarded to the host. + This routine starts a background thread that reclaims memory and compacts free pages so the maximum + number of pages can be discarded back to the host. + + The policy is: + + 1. 
Gradual mode (gentle, cold-first via cgroup memory.reclaim) is gated on sustained CPU idle and + drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it does + not churn near the floor. + + 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, + hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It + drops once on becoming idle, then re-drops only after the cache grows meaningfully. + + 3. Compaction is gated on free-memory *growth*: it runs when there are newly-freed pages worth + coalescing -- whether they were freed by our reclaim or by a process exiting -- and is skipped + on ticks where nothing new was freed. This both avoids the previous "compact every tick" waste + and ensures naturally-freed memory still gets coalesced for efficient page reporting. + + CPU utilization is measured over each interval using all non-idle CPU time (user, system, irq, + softirq, steal) rather than just user time, so kernel-bound work keeps the VM out of the idle state. Arguments: @@ -3610,11 +3948,16 @@ Return Value: try { + if (Mode == LxMiniInitMemoryReclaimModeDisabled) + { + return; + } + std::thread([Mode = Mode]() mutable { try { // - // Set the thread's scheduling policy to idle. + // Run at idle scheduling priority so reclaim never competes with real work. // sched_param Parameter{}; @@ -3623,25 +3966,8 @@ try THROW_ERRNO_IF(Result, Result != 0); // - // Periodically check if the machine is idle by querying procfs for CPU usage. - // Memory compaction will occur if both of the following conditions are true: - // 1. The CPU time since the last check is greater than the idle threshold. - // 2. The current CPU usage is below the idle threshold. This is measured by taking two readings one second apart. 
- // - - double MemoryLow = 1024 * 1024 * 1024; - double MemoryHigh = 1.1 * 1024.0 * 1024.0 * 1024.0; - const int IdleThreshold = get_nprocs(); // Change math to adjust if sysconf(_SC_CLK_TCK) != 100? Is 1% - long long int Start, Stop = 0; - auto constexpr SleepDuration = std::chrono::seconds(30); - size_t ReclaimIndex = 0; - long long int const ReclaimThreshold = (get_nprocs() * sysconf(_SC_CLK_TCK) * SleepDuration / std::chrono::seconds(1)) / 200; // 0.5% - long long int ReclaimWindow[20] = {}; // 10 minutes - long long int ReclaimWindowLength = COUNT_OF(ReclaimWindow); - bool ReclaimIdling = false; - - // - // Fall back to drop cache if the required cgroup path is not present. + // Gradual mode needs the cgroup memory.reclaim knob; fall back to dropping caches if it is + // not present. // if (Mode == LxMiniInitMemoryReclaimModeGradual && access(RECLAIM_PATH, W_OK) < 0) @@ -3650,89 +3976,42 @@ try Mode = LxMiniInitMemoryReclaimModeDropCache; } - if (Mode == LxMiniInitMemoryReclaimModeGradual) - { - static_assert(COUNT_OF(ReclaimWindow) >= 6); - ReclaimWindowLength = 6; // Set to 3 minutes. - } - - for (auto i = 1; i < ReclaimWindowLength; i++) - { - ReclaimWindow[i] = LLONG_MIN; - } + MemoryReclaimState State; - std::this_thread::sleep_for(SleepDuration); for (;;) { - auto const Target = std::chrono::steady_clock::now() + SleepDuration; - Start = GetUserCpuTime(); - THROW_LAST_ERROR_IF(Start == -1); + std::this_thread::sleep_for(c_pollInterval); - if (Mode != LxMiniInitMemoryReclaimModeDisabled) + unsigned long long Busy = 0; + unsigned long long Idle = 0; + if (!ReadAggregateCpuTimes(Busy, Idle)) { - // - // Ensure that utilization is below 0.5% from the last 30 seconds, and last n minutes, of usage. 
- // - - size_t const LastIndex = (ReclaimIndex + 1) % ReclaimWindowLength; - if ((ReclaimWindow[LastIndex] > Start - ReclaimThreshold * (ReclaimWindowLength + 1)) && - (ReclaimWindow[ReclaimIndex] > Start - ReclaimThreshold)) - { - if (Mode == LxMiniInitMemoryReclaimModeGradual) - { - double MemorySize = GetMemoryInUse(); - THROW_LAST_ERROR_IF(MemorySize < 0); - - if (MemorySize > MemoryHigh) - { - ReclaimIdling = false; - } - - if (!ReclaimIdling && MemorySize > MemoryLow) - { - double MemoryTargetSize = MemorySize * 0.97; - std::string MemoryToFree = std::to_string(size_t(MemorySize - MemoryTargetSize)); - // EAGAIN Means that it attempted, but was unable to evict sufficient pages. - THROW_LAST_ERROR_IF(WriteToFile(RECLAIM_PATH, MemoryToFree.c_str()) < 0 && errno != EAGAIN); - - if (MemoryTargetSize < MemoryLow) - { - ReclaimIdling = true; - } - } - } - else if (!ReclaimIdling) - { - ReclaimIdling = true; - THROW_LAST_ERROR_IF(WriteToFile("/proc/sys/vm/drop_caches", "1\n") < 0); - } - } - else - { - ReclaimIdling = false; - } - - ReclaimIndex = LastIndex; - ReclaimWindow[ReclaimIndex] = Start; + continue; } // - // Perform memory compaction if the VM is idle. - // This coalesces free pages into larger blocks for more efficient page reporting. + // Two samples are required to compute utilization over the interval. 
// - if ((Start - Stop) > IdleThreshold) + if (!State.HavePreviousSample) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - Stop = GetUserCpuTime(); - THROW_LAST_ERROR_IF(Stop == -1); - if ((Stop - Start) < IdleThreshold) - { - THROW_LAST_ERROR_IF(WriteToFile("/proc/sys/vm/compact_memory", "1\n") < 0); - } + State.PreviousBusy = Busy; + State.PreviousIdle = Idle; + State.HavePreviousSample = true; + continue; } - std::this_thread::sleep_until(Target); + const unsigned long long BusyDelta = Busy - State.PreviousBusy; + const unsigned long long TotalDelta = (Busy + Idle) - (State.PreviousBusy + State.PreviousIdle); + State.PreviousBusy = Busy; + State.PreviousIdle = Idle; + + const bool IntervalIdle = (TotalDelta == 0) || (BusyDelta * 1000 <= TotalDelta * c_busyThresholdPerMille); + + const bool Reclaimed = (Mode == LxMiniInitMemoryReclaimModeGradual) ? RunGradualTick(State, IntervalIdle) + : RunDropCacheTick(State, IntervalIdle); + + RunCompactionTick(State, Reclaimed); } } CATCH_LOG() From e0d56824f88eed15b852409ee2cf0cc2ccf12d10 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 10:18:57 -0700 Subject: [PATCH 2/4] Drive gradual reclaim by memory pressure instead of CPU idle A CPU-bound workload can sit on gigabytes of cold page cache that a CPU-idle check would never reclaim. Read the PSI "some avg10" memory pressure from /proc/pressure/memory and reclaim cold cache toward the fixed floor whenever pressure is low, even while the VM is busy, backing off once the workload starts stalling on memory. A busy interval reclaims at most a bounded step (c_gradualStepBusyBytes) so a large backlog is drained gently; an idle interval drains the full excess at once. When PSI is unavailable (kernel built without CONFIG_PSI), gradual reclaim falls back to gating on CPU idle. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 97 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 8 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 705f556cf..296d6b122 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -3732,6 +3732,52 @@ Return Value: return static_cast(Info.freeram) * Info.mem_unit; } +static double GetMemoryPressureAvg10() + +/*++ + +Routine Description: + + This routine returns the PSI "some" 10-second memory pressure average from + /proc/pressure/memory. This is the fraction of time (0-100) that some task + stalled waiting on memory in the last 10 seconds; ~0 means there is slack to + reclaim cold pages without hurting the workload, regardless of CPU activity. + +Arguments: + + None. + +Return Value: + + The "some avg10" pressure value, or -1.0 if PSI is unavailable. + +--*/ + +{ + // + // PSI may be unavailable (kernel built without CONFIG_PSI), which is a normal + // condition the caller handles, so failures are not logged. + // + + char Buffer[256]; + if (!ReadProcFile("/proc/pressure/memory", Buffer, sizeof(Buffer), false)) + { + return -1.0; + } + + // + // The first line is "some avg10= avg60= avg300= total=". + // + + const char* Marker = strstr(Buffer, "avg10="); + if (Marker == nullptr) + { + return -1.0; + } + + return strtod(Marker + (sizeof("avg10=") - 1), nullptr); +} + namespace { // @@ -3754,6 +3800,14 @@ constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; +// Gradual is driven by PSI: reclaim cold cache while the "some avg10" memory pressure is below this +// value (in percent), and back off above it. 
+constexpr double c_pressureReclaimMax = 1.0; + +// While the VM is busy, reclaim at most this much per interval so a large backlog is drained gently; +// an idle interval drains the full excess at once. +constexpr long long c_gradualStepBusyBytes = 256ll * 1024 * 1024; + // Compaction runs once free memory grows by at least this much since the last compaction. constexpr long long c_compactFreeGrowthBytes = 256ll * 1024 * 1024; @@ -3783,9 +3837,11 @@ bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) Routine Description: - Runs one interval of gentle reclaim (cold-first via cgroup memory.reclaim). Reclaim is gated on CPU - idle and drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it - does not churn near the floor. + Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward a + fixed floor. While the kernel reports little/no memory pressure (PSI) there is cold cache the guest is + not really using, so it is reclaimed -- even while the VM is busy. A busy interval reclaims at most a + bounded step so a large backlog is drained gently; an idle interval drains the full excess. When PSI + is unavailable, reclaim falls back to gating on CPU idle. Return Value: @@ -3796,7 +3852,27 @@ Return Value: { (void)State; - if (!IntervalIdle) + const double Pressure = GetMemoryPressureAvg10(); + + bool MayReclaim; + if (Pressure < 0.0) + { + // + // No PSI brake available: gate reclaim on CPU idle. + // + + MayReclaim = IntervalIdle; + } + else + { + // + // Reclaim only while pressure is low; back off once the workload starts stalling on memory. + // + + MayReclaim = Pressure < c_pressureReclaimMax; + } + + if (!MayReclaim) { return false; } @@ -3813,7 +3889,8 @@ Return Value: return false; } - const std::string Bytes = std::to_string(Excess); + const long long ToFree = IntervalIdle ? 
Excess : (std::min)(Excess, c_gradualStepBusyBytes); + const std::string Bytes = std::to_string(ToFree); // Best-effort: WriteToFile logs internally on failure. EAGAIN merely means the kernel could not evict // the full amount this pass (pages were still freed), so it counts as reclaim. Never throw on a @@ -3920,9 +3997,13 @@ Routine Description: The policy is: - 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is gated on sustained CPU idle and - drains reclaimable page cache down toward a fixed floor, leaving a hysteresis margin so it does - not churn near the floor. + 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is driven by *memory pressure*, + not CPU idleness. While the kernel reports little/no memory pressure (PSI), there is cold + memory the guest is not really using, so it is reclaimed toward a fixed floor -- even while + the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of cold + cache that a CPU-idle check would never reclaim. A busy interval reclaims at most a bounded + step so a large backlog is drained gently; an idle interval drains the full excess. When PSI + is unavailable, Gradual falls back to reclaiming only while CPU-idle. 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It From 3c2691bc1ab5b1745e452724c5d46cbb955d6d14 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 10:19:40 -0700 Subject: [PATCH 3/4] Adapt the gradual reclaim floor to the working set using refaults Reclaiming toward a single fixed floor either gives back too little on large VMs or evicts pages a larger working set immediately faults back in. Make the floor adaptive and self-regulating: - Track file refaults (/proc/vmstat workingset_refault_file) as a signal that reclaim is cutting into the working set. 
When refaults spike (or PSI pressure rises into the backoff band), raise the floor to protect what the workload is actually using and stop reclaiming. - After sustained calm, decay the floor back toward the base so a shrunken working set is eventually re-probed downward. - Scale the floor's upper bound to a fraction of guest RAM so large working sets on large VMs can be fully protected, falling back to a fixed cap when total RAM is unavailable. The PSI-unavailable path keeps the same refault brake and floor decay so behavior degrades gracefully on kernels without CONFIG_PSI. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/util.cpp | 274 +++++++++++++++++++++++++++++++++------- 1 file changed, 230 insertions(+), 44 deletions(-) diff --git a/src/linux/init/util.cpp b/src/linux/init/util.cpp index 296d6b122..414f399fc 100644 --- a/src/linux/init/util.cpp +++ b/src/linux/init/util.cpp @@ -3732,6 +3732,37 @@ Return Value: return static_cast(Info.freeram) * Info.mem_unit; } +static long long GetTotalMemoryBytes() + +/*++ + +Routine Description: + + This routine returns the guest's total RAM (in bytes), used to scale the + adaptive reclaim floor cap relative to the configured VM size rather than a + fixed absolute value. + +Arguments: + + None. + +Return Value: + + Total memory in bytes, or -1 on failure. + +--*/ + +{ + struct sysinfo Info = {}; + if (sysinfo(&Info) < 0) + { + LOG_ERROR("sysinfo failed {}", errno); + return -1; + } + + return static_cast(Info.totalram) * Info.mem_unit; +} + static double GetMemoryPressureAvg10() /*++ @@ -3778,6 +3809,48 @@ Return Value: return strtod(Marker + (sizeof("avg10=") - 1), nullptr); } +static long long GetFileRefaults() + +/*++ + +Routine Description: + + This routine returns the cumulative workingset_refault_file counter from + /proc/vmstat. A rising counter means file pages that were previously evicted + are being faulted back in -- i.e. 
reclaim (or workload pressure) is cutting + into the live working set, which is used as a brake on further reclaim. + +Arguments: + + None. + +Return Value: + + The cumulative file refault count (in pages), or -1 if unavailable. + +--*/ + +{ + // + // /proc/vmstat is several KB and the counter appears well into the file, so a + // generous buffer (filled across reads by ReadProcFile) avoids truncating it. + // + + char Buffer[16384]; + if (!ReadProcFile("/proc/vmstat", Buffer, sizeof(Buffer))) + { + return -1; + } + + const char* Marker = strstr(Buffer, "workingset_refault_file "); + if (Marker == nullptr) + { + return -1; + } + + return strtoll(Marker + (sizeof("workingset_refault_file ") - 1), nullptr, 10); +} + namespace { // @@ -3795,17 +3868,31 @@ constexpr unsigned long long c_busyThresholdPerMille = 5; // 0.5% constexpr int c_dropCacheIdleIntervals = 30; // 5 minutes constexpr long long c_cacheGrowthRearmBytes = 256ll * 1024 * 1024; -// Gradual: reclaimable cache below this floor is always retained; only the excess above it (beyond a -// hysteresis margin) is reclaimed. +// Gradual: PSI "some avg10" thresholds. Reclaim only when pressure is below the reclaim ceiling; raise +// the floor and back off when pressure exceeds the backoff floor. The band in between is a dead zone +// that prevents oscillation. +constexpr double c_pressureReclaimMax = 1.0; +constexpr double c_pressureBackoffMin = 5.0; + +// Gradual: back off if more than this many file pages were refaulted in one interval (~40 MB at 4 KB +// pages), indicating reclaim is cutting into the live working set. +constexpr long long c_refaultBackoffPages = 10000; + +// Gradual: adaptive floor of reclaimable cache to retain. It starts small, doubles (up to the cap) +// whenever reclaim hurts, and decays back toward the base only after sustained calm so a learned working +// set is held steady instead of being re-probed (and re-evicted) every tick. 
The cap is scaled to a +// fraction of guest RAM at startup (see c_floorMaxRam*), so large working sets on large VMs can be fully +// protected; the fallback applies only if total RAM cannot be determined. constexpr long long c_floorBaseBytes = 128ll * 1024 * 1024; +constexpr long long c_floorMaxFallbackBytes = 4096ll * 1024 * 1024; +constexpr long long c_floorMaxRamNumerator = 3; +constexpr long long c_floorMaxRamDenominator = 4; +constexpr long long c_floorDecayBytes = 64ll * 1024 * 1024; +constexpr int c_floorDecayAfterCalmTicks = 6; // 60s constexpr long long c_gradualHysteresisBytes = 128ll * 1024 * 1024; -// Gradual is driven by PSI: reclaim cold cache while the "some avg10" memory pressure is below this -// value (in percent), and back off above it. -constexpr double c_pressureReclaimMax = 1.0; - -// While the VM is busy, reclaim at most this much per interval so a large backlog is drained gently; -// an idle interval drains the full excess at once. +// Gradual: when the VM is busy, reclaim at most this much per interval (cautious, so a burst of refaults +// can brake it before too much is evicted); when idle, drain the full excess. constexpr long long c_gradualStepBusyBytes = 256ll * 1024 * 1024; // Compaction runs once free memory grows by at least this much since the last compaction. @@ -3827,21 +3914,64 @@ struct MemoryReclaimState bool ReclaimedThisIdlePeriod = false; long long CacheAtLastDrop = 0; + // Gradual. + long long FloorBytes = c_floorBaseBytes; + long long FloorMaxBytes = c_floorMaxFallbackBytes; + long long PreviousRefaults = -1; + int CalmStreak = 0; + // Compaction. long long FreeAtLastCompaction = 0; }; +void RaiseFloorToProtect(MemoryReclaimState& State, long long Cache) + +/*++ + +Routine Description: + + Reclaim is hurting (real pressure, or the workload is refaulting evicted file pages). 
Rather than + ramping the floor up over many thrashing ticks, this immediately raises it to protect the cache that + is actually in use: it jumps to at least the current reclaimable cache (plus a margin) so reclaim + stops cutting into the working set on this very tick. The slow calm-time decay then re-probes downward + gently, so an over-estimate self-corrects without re-thrashing. + +--*/ + +{ + const long long Protect = (Cache < 0) ? (State.FloorBytes * 2) : (Cache + c_gradualHysteresisBytes); + State.FloorBytes = (std::min)(State.FloorMaxBytes, (std::max)(State.FloorBytes * 2, Protect)); + State.CalmStreak = 0; +} + +void DecayFloorAfterCalm(MemoryReclaimState& State) + +/*++ + +Routine Description: + + Relaxes the floor toward the base, but only after sustained calm, so a learned working set is not + immediately re-probed (and re-evicted) the moment pressure subsides. + +--*/ + +{ + State.CalmStreak += 1; + if (State.CalmStreak >= c_floorDecayAfterCalmTicks && State.FloorBytes > c_floorBaseBytes) + { + State.FloorBytes = (std::max)(c_floorBaseBytes, State.FloorBytes - c_floorDecayBytes); + } +} + bool RunGradualTick(MemoryReclaimState& State, bool IntervalIdle) /*++ Routine Description: - Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward a - fixed floor. While the kernel reports little/no memory pressure (PSI) there is cold cache the guest is - not really using, so it is reclaimed -- even while the VM is busy. A busy interval reclaims at most a - bounded step so a large backlog is drained gently; an idle interval drains the full excess. When PSI - is unavailable, reclaim falls back to gating on CPU idle. + Runs one interval of pressure-driven gentle reclaim (cold-first via cgroup memory.reclaim) toward the + adaptive floor. The refault rate is refreshed every interval so it reflects current activity + regardless of whether reclaim runs. 
Return Value: @@ -3850,53 +3980,96 @@ Return Value: --*/ { - (void)State; - const double Pressure = GetMemoryPressureAvg10(); + const long long Refaults = GetFileRefaults(); + const long long RefaultDelta = (Refaults >= 0 && State.PreviousRefaults >= 0) ? (Refaults - State.PreviousRefaults) : 0; + if (Refaults >= 0) + { + State.PreviousRefaults = Refaults; + } + + const bool RefaultBrake = RefaultDelta >= c_refaultBackoffPages; + const long long Cache = GetReclaimableCacheBytes(); bool MayReclaim; if (Pressure < 0.0) { // - // No PSI brake available: gate reclaim on CPU idle. + // No PSI brake available: gate reclaim on CPU idle, but still honor the refault brake so reclaim + // backs off (and the floor is raised to protect the working set) if it starts faulting evicted + // file pages back in. On calm idle intervals decay the floor as in the PSI path so a learned + // working set is eventually re-probed downward. // - MayReclaim = IntervalIdle; + if (RefaultBrake) + { + RaiseFloorToProtect(State, Cache); + MayReclaim = false; + } + else if (IntervalIdle) + { + DecayFloorAfterCalm(State); + MayReclaim = true; + } + else + { + State.CalmStreak = 0; + MayReclaim = false; + } } - else + else if (Pressure >= c_pressureBackoffMin || RefaultBrake) + { + RaiseFloorToProtect(State, Cache); + MayReclaim = false; + } + else if (Pressure >= c_pressureReclaimMax) { // - // Reclaim only while pressure is low; back off once the workload starts stalling on memory. + // Dead zone: some pressure but not enough to back off. Hold steady. // - MayReclaim = Pressure < c_pressureReclaimMax; + State.CalmStreak = 0; + MayReclaim = false; } - - if (!MayReclaim) + else { - return false; - } + // + // Calm: relax the floor toward the base after sustained calm, then reclaim cold excess. 
+ // - const long long Cache = GetReclaimableCacheBytes(); - if (Cache < 0) - { - return false; + DecayFloorAfterCalm(State); + MayReclaim = true; } - const long long Excess = Cache - c_floorBaseBytes; - if (Excess <= c_gradualHysteresisBytes) + bool Reclaimed = false; + if (MayReclaim && Cache >= 0) { - return false; - } + const long long Excess = Cache - State.FloorBytes; + if (Excess > c_gradualHysteresisBytes) + { + const long long ToFree = IntervalIdle ? Excess : (std::min)(Excess, c_gradualStepBusyBytes); + const std::string Bytes = std::to_string(ToFree); + + // Best-effort: WriteToFile logs internally on failure. EAGAIN merely means the kernel could + // not evict the full amount this pass (pages were still freed), so it counts as reclaim. + // Never throw on a transient write error and tear down the long-lived reduction thread. + const int Status = WriteToFile(RECLAIM_PATH, Bytes.c_str()); + Reclaimed = (Status == 0) || (errno == EAGAIN); - const long long ToFree = IntervalIdle ? Excess : (std::min)(Excess, c_gradualStepBusyBytes); - const std::string Bytes = std::to_string(ToFree); + // + // Reset the refault baseline to post-reclaim so the next interval's delta attributes refaults + // to this reclaim. + // - // Best-effort: WriteToFile logs internally on failure. EAGAIN merely means the kernel could not evict - // the full amount this pass (pages were still freed), so it counts as reclaim. Never throw on a - // transient write error and tear down the long-lived reduction thread. - const int Status = WriteToFile(RECLAIM_PATH, Bytes.c_str()); - return (Status == 0) || (errno == EAGAIN); + const long long After = GetFileRefaults(); + if (After >= 0) + { + State.PreviousRefaults = After; + } + } + } + + return Reclaimed; } bool RunDropCacheTick(MemoryReclaimState& State, bool IntervalIdle) @@ -3999,11 +4172,12 @@ Routine Description: 1. Gradual mode (gentle, cold-first via cgroup memory.reclaim) is driven by *memory pressure*, not CPU idleness. 
While the kernel reports little/no memory pressure (PSI), there is cold - memory the guest is not really using, so it is reclaimed toward a fixed floor -- even while - the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of cold - cache that a CPU-idle check would never reclaim. A busy interval reclaims at most a bounded - step so a large backlog is drained gently; an idle interval drains the full excess. When PSI - is unavailable, Gradual falls back to reclaiming only while CPU-idle. + memory the guest is not really using, so it is reclaimed toward an adaptive floor -- even + while the VM is busy. This is important because a CPU-bound workload can sit on gigabytes of + cold cache that a CPU-idle check would never reclaim. The file refault rate is used as a + brake: if reclaim starts faulting evicted file pages back in (or pressure rises), the floor is + raised and reclaim backs off, so the loop self-regulates to the true working-set size. When + PSI is unavailable, Gradual falls back to reclaiming only while CPU-idle, still honoring the refault brake. 2. DropCache mode (the indiscriminate sledgehammer: drop_caches evicts the entire page cache, hot and cold alike) cannot run safely under load, so it stays gated on sustained CPU idle. It @@ -4059,6 +4233,17 @@ try MemoryReclaimState State; + // + // Scale the adaptive floor cap to a fraction of guest RAM so large working sets on large VMs + // can be fully protected, falling back to a fixed cap only if total RAM is unavailable.
+ // + + const long long TotalRam = GetTotalMemoryBytes(); + if (TotalRam > 0) + { + State.FloorMaxBytes = (std::max)(c_floorBaseBytes, (TotalRam * c_floorMaxRamNumerator) / c_floorMaxRamDenominator); + } + for (;;) { std::this_thread::sleep_for(c_pollInterval); @@ -4079,6 +4264,7 @@ try State.PreviousBusy = Busy; State.PreviousIdle = Idle; State.HavePreviousSample = true; + State.PreviousRefaults = GetFileRefaults(); continue; } From 20a38ef791de868767bac55bfbe294bb5dc3f6b4 Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Thu, 11 Jun 2026 10:20:45 -0700 Subject: [PATCH 4/4] Make gradual the default memory reclaim mode Now that gradual reclaim is pressure-driven and protects the working set with an adaptive floor, it reclaims more cold memory than the idle-gated drop_caches sledgehammer while being safe to run under load. Make it the default for both the WSL2 VM mini-init (via the config default that flows to the guest) and the WSLc container init. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/linux/init/WSLCInit.cpp | 2 +- src/windows/common/WslCoreConfig.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/linux/init/WSLCInit.cpp b/src/linux/init/WSLCInit.cpp index c85efe79e..01aebe165 100644 --- a/src/linux/init/WSLCInit.cpp +++ b/src/linux/init/WSLCInit.cpp @@ -741,7 +741,7 @@ void HandleMessageImpl( // Start the memory reduction thread now that procfs is in its final location. 
static std::once_flag memoryReductionFlag; - std::call_once(memoryReductionFlag, [] { StartMemoryReductionThread(LxMiniInitMemoryReclaimModeDropCache); }); + std::call_once(memoryReductionFlag, [] { StartMemoryReductionThread(LxMiniInitMemoryReclaimModeGradual); }); } response.Result = 0; diff --git a/src/windows/common/WslCoreConfig.h b/src/windows/common/WslCoreConfig.h index 8e48f04dc..eb7a15f37 100644 --- a/src/windows/common/WslCoreConfig.h +++ b/src/windows/common/WslCoreConfig.h @@ -361,7 +361,7 @@ struct Config bool EnableHardwarePerformanceCounters = !shared::Arm64; bool EnableAutoProxy = true; int InitialAutoProxyTimeout = 1000; - MemoryReclaimMode MemoryReclaim = MemoryReclaimMode::DropCache; + MemoryReclaimMode MemoryReclaim = MemoryReclaimMode::Gradual; bool EnableSparseVhd = false; UINT64 VhdSizeBytes = 0x10000000000; // 1TB