From 20ba5b5a0b4331edba03061a7df0e2b640678589 Mon Sep 17 00:00:00 2001 From: Jevan Saks Date: Fri, 12 Jun 2026 11:54:23 -0700 Subject: [PATCH 1/2] Serialize Linux test runs and capture crash dumps as a dedicated artifact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Linux CI leg keeps failing with 'Catastrophic failure: Test process crashed with exit code 137' (SIGKILL = kernel OOM-killer). The previous attempt to fix this (#1722) set XUNIT_MAX_PARALLEL_THREADS=1, -p:BuildInParallel=false, and RunConfiguration.MaxCpuCount=1, but those options only control parallelism *within* a single test assembly. They do not stop a repo-wide 'dotnet test' from launching vstest hosts for multiple test projects concurrently, which is what causes two multi-GB generator test hosts (CsWin32Generator.Tests and Microsoft.Windows.CsWin32.Tests) to run at the same time and OOM the agent. The build logs confirm both hosts were live concurrently after that change. This change: * Removes the ineffective parallelism settings from #1722. * On non-Windows agents, enumerates 'test/*.Tests' projects and invokes 'dotnet test' once per project — mirroring the working pattern used by the GitHub Actions Linux job in '.github/workflows/build.yml'. * Captures 'free -h' and '/proc/meminfo' before each project, and dumps 'dmesg' (where the OOM-killer logs its kills) plus a memory snapshot after any failed run — so future OOMs are diagnosable without guessing. * Enables the .NET runtime mini-dump fallback (heap dumps) in case a managed abort precedes the kill. * Sweeps any captured '*.dmp' / 'core.*' / 'coredump.*' files into a new 'crashDumps' artifact registered in build.yml, so they are easy to download without grabbing the 7+ GB testResults bundle. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- azure-pipelines/build.yml | 2 + tools/artifacts/crashDumps.ps1 | 19 +++++ tools/dotnet-test-cloud.ps1 | 151 +++++++++++++++++++++++++++------ 3 files changed, 147 insertions(+), 25 deletions(-) create mode 100644 tools/artifacts/crashDumps.ps1 diff --git a/azure-pipelines/build.yml b/azure-pipelines/build.yml index 661e15b5..f455d99f 100644 --- a/azure-pipelines/build.yml +++ b/azure-pipelines/build.yml @@ -31,6 +31,8 @@ parameters: default: - name: build_logs - name: coverageResults + - name: crashDumps + testOnly: true - name: deployables sbomEnabled: true - name: projectAssetsJson diff --git a/tools/artifacts/crashDumps.ps1 b/tools/artifacts/crashDumps.ps1 new file mode 100644 index 00000000..dc18c4c6 --- /dev/null +++ b/tools/artifacts/crashDumps.ps1 @@ -0,0 +1,19 @@ +[CmdletBinding()] +Param( +) + +$result = @{} + +# Crash dumps and related diagnostics are staged into $(Build.ArtifactStagingDirectory)/crashDumps +# by tools/dotnet-test-cloud.ps1 when --blame-crash or the .NET DbgEnableMiniDump runtime +# environment captures a dump (e.g. for OOM-killed test hosts on Linux). +$artifactStaging = & "$PSScriptRoot/../Get-ArtifactsStagingDirectory.ps1" +$dumpsPath = Join-Path $artifactStaging 'crashDumps' +if (Test-Path $dumpsPath) { + $files = @(Get-ChildItem $dumpsPath -Recurse -File) + if ($files.Count -gt 0) { + $result[$dumpsPath] = $files + } +} + +$result diff --git a/tools/dotnet-test-cloud.ps1 b/tools/dotnet-test-cloud.ps1 index a97abb33..58695e8c 100644 --- a/tools/dotnet-test-cloud.ps1 +++ b/tools/dotnet-test-cloud.ps1 @@ -46,34 +46,133 @@ if ($x86) { $testBinLog = Join-Path $ArtifactStagingFolder (Join-Path build_logs test.binlog) $testDiagLog = Join-Path $ArtifactStagingFolder (Join-Path test_logs diag.log) +$dumpStagingFolder = Join-Path $ArtifactStagingFolder 'crashDumps' -# On Linux/macOS, limit test parallelism to avoid OOM kills. 
-# The generator test projects each consume several GB of RAM; running multiple test hosts -# simultaneously on memory-constrained Linux agents causes the OOM killer to terminate -# test processes (exit code 137). We limit all layers: -# 1. xunit parallelism within each test assembly (XUNIT_MAX_PARALLEL_THREADS=1) -# 2. MSBuild project-level parallelism (BuildInParallel=false) -# 3. vstest host parallelism (MaxCpuCount=1 via runsettings) -# Windows agents have enough RAM for full parallelism. -$extraTestArgs = @() -if (!$IsWindows) { - $env:XUNIT_MAX_PARALLEL_THREADS = '1' - $extraTestArgs = @('-p:BuildInParallel=false', '--', 'RunConfiguration.MaxCpuCount=1') - Write-Host "Limiting test parallelism to avoid OOM on Linux" -ForegroundColor Cyan +function Invoke-DotnetTest { + param( + [Parameter(Mandatory)][string]$Target, + [string]$BinLogSuffix = '' + ) + + $binLogPath = if ($BinLogSuffix) { + Join-Path $ArtifactStagingFolder (Join-Path build_logs "test.$BinLogSuffix.binlog") + } else { + $testBinLog + } + + & $dotnet test $Target ` + --no-build ` + -c $Configuration ` + --filter "TestCategory!=HighMemory&TestCategory!=RequiresHardware$env:TESTFILTER" ` + --collect "Code Coverage;Format=cobertura" ` + --settings "$PSScriptRoot/test.runsettings" ` + --blame-hang-timeout 1500s ` + --blame-crash ` + -bl:"$binLogPath" ` + --diag "$testDiagLog;TraceLevel=info" ` + --logger trx +} + +function Write-MemorySnapshot([string]$Label) { + if (-not $IsLinux) { return } + Write-Host "" + Write-Host "==== $Label ====" -ForegroundColor Yellow + & free -h 2>&1 | Out-Host + Write-Host "---- /proc/meminfo (top 10) ----" + Get-Content /proc/meminfo -TotalCount 10 -ErrorAction SilentlyContinue | Out-Host } -& $dotnet test $RepoRoot ` - --no-build ` - -c $Configuration ` - --filter "TestCategory!=HighMemory&TestCategory!=RequiresHardware$env:TESTFILTER" ` - --collect "Code Coverage;Format=cobertura" ` - --settings "$PSScriptRoot/test.runsettings" ` - --blame-hang-timeout 1500s ` - 
--blame-crash ` - -bl:"$testBinLog" ` - --diag "$testDiagLog;TraceLevel=info" ` - --logger trx ` - @extraTestArgs +function Write-DmesgTail { + if (-not $IsLinux) { return } + Write-Host "" + Write-Host "==== dmesg tail (looking for OOM killer messages) ====" -ForegroundColor Yellow + # dmesg requires CAP_SYSLOG; try direct first, then sudo (most ADO Linux agents have passwordless sudo). + $out = & dmesg --ctime 2>$null | Select-Object -Last 200 + if ($LASTEXITCODE -ne 0 -or -not $out) { + $out = & sudo -n dmesg --ctime 2>$null | Select-Object -Last 200 + } + if ($LASTEXITCODE -ne 0 -or -not $out) { + Write-Host '(dmesg unavailable: kernel.dmesg_restrict=1 and sudo not permitted)' -ForegroundColor DarkGray + } else { + $out | Out-Host + } +} + +# Enable .NET runtime crash dumps on managed unhandled exceptions / aborts. +# These are emitted in addition to the dumps captured by `--blame-crash`, +# and survive scenarios where blame's createdump invocation cannot fire +# (for example, SIGKILL by the kernel OOM-killer never reaches managed code, +# but a runtime abort / unhandled exception that precedes the kill is captured). +New-Item -ItemType Directory -Force -Path $dumpStagingFolder | Out-Null +# Always drop a readme so the artifact upload has at least one file to publish. +@' +This artifact collects test-host crash dumps captured by `dotnet test --blame-crash` +and by the .NET runtime (DOTNET_DbgEnableMiniDump). + +If the artifact only contains this README, no managed crashes were captured. That is +often the case when a test host is killed by the kernel (e.g. the Linux OOM-killer +sends SIGKILL), since SIGKILL gives the runtime no opportunity to write a dump. +In that case, inspect the test step's console output for memory diagnostics and +the `dmesg` tail that the test script captures after a failure. 
+'@ | Set-Content -Path (Join-Path $dumpStagingFolder 'README.txt') +$env:DOTNET_DbgEnableMiniDump = '1' +$env:DOTNET_DbgMiniDumpType = '2' # 2 = Heap (managed heap + threads; smaller than full memory) +$env:DOTNET_DbgMiniDumpName = (Join-Path $dumpStagingFolder 'coredump.%p.%t.dmp') +$env:DOTNET_CreateDumpDiagnostics = '1' + +$overallExitCode = 0 + +if (-not $IsWindows -and -not $x86) { + # On Linux/macOS the generator test projects each consume several GB of RAM. + # `dotnet test ` runs vstest hosts for multiple test projects in parallel, + # and on memory-constrained agents that causes the kernel OOM-killer to terminate + # one of the hosts (exit code 137 = SIGKILL). The previous attempt to fix this + # via XUNIT_MAX_PARALLEL_THREADS / BuildInParallel / RunConfiguration.MaxCpuCount + # only limits parallelism *within* a single test assembly, not across them. + # + # Mirror the working GitHub Actions Linux workflow: enumerate test projects and + # invoke `dotnet test` once per project, serially. + Write-Host 'Non-Windows agent: serializing test runs project-by-project to avoid OOM kills.' 
-ForegroundColor Cyan + Write-MemorySnapshot 'Pre-test memory state' + + $testProjects = Get-ChildItem -Path "$RepoRoot/test" -Directory | + Where-Object Name -Like '*.Tests' | + Sort-Object Name | + ForEach-Object { Get-ChildItem -Path $_.FullName -Filter '*.csproj' | Select-Object -First 1 } | + Where-Object { $_ } + + foreach ($proj in $testProjects) { + $projName = $proj.BaseName + Write-Host '' + Write-Host "▶️ Running tests in $projName" -ForegroundColor Cyan + Write-MemorySnapshot "Memory before $projName" + Invoke-DotnetTest -Target $proj.FullName -BinLogSuffix $projName + $thisExit = $LASTEXITCODE + if ($thisExit -ne 0) { + Write-Host "❌ Tests in $projName exited with code $thisExit" -ForegroundColor Red + $overallExitCode = $thisExit + Write-MemorySnapshot "Post-failure memory state ($projName)" + Write-DmesgTail + } + } +} else { + Invoke-DotnetTest -Target $RepoRoot + $overallExitCode = $LASTEXITCODE +} + +# Move any captured crash dumps (from --blame-crash or DOTNET_DbgEnableMiniDump) into +# the dedicated staging folder so they're easy to find in the published artifact. 
+Get-ChildItem -Path "$RepoRoot/test" -Recurse -File -ErrorAction SilentlyContinue | + Where-Object { $_.Name -like '*.dmp' -or $_.Name -like 'core.*' -or $_.Name -like 'coredump.*' } | + ForEach-Object { + $dest = Join-Path $dumpStagingFolder $_.Name + try { + Move-Item -Path $_.FullName -Destination $dest -Force -ErrorAction Stop + Write-Host "Collected crash dump: $($_.Name) ($([math]::Round($_.Length / 1MB, 1)) MB)" + } catch { + Write-Host "Failed to move crash dump $($_.FullName): $_" -ForegroundColor Yellow + } + } $unknownCounter = 0 Get-ChildItem -Recurse -Path $RepoRoot\test\*.trx |% { @@ -100,3 +199,5 @@ Get-ChildItem -Recurse -Path $RepoRoot\test\*.trx |% { Write-Host "##vso[results.publish type=VSTest;runTitle=$runTitle;publishRunAttachments=true;resultFiles=$_;failTaskOnFailedTests=true;testRunSystem=VSTS - PTR;]" } } + +exit $overallExitCode From 162ed15e96665680d47ab65196f2f968dfdd2925 Mon Sep 17 00:00:00 2001 From: Jevan Saks Date: Fri, 12 Jun 2026 12:14:27 -0700 Subject: [PATCH 2/2] Force MSBuild -m:1 instead of per-csproj loop (preserves sln NonWindows filter) The per-csproj enumeration in 20ba5b5 bypassed the solution's NonWindows configuration, which filters out Windows-only test projects on Linux/macOS. It also passed each csproj to dotnet test directly, which on .NET 10 SDK caused vstest to reject the test exe path with 'argument is invalid' for the small projects (CsWin32Generator.BuildTasks.Tests, GenerationSandbox.*). Switch back to the solution-level invocation and instead serialize via MSBuild -m:1 (single worker node), which causes dotnet test to run the VSTest target one project at a time and prevents the OOM-killer from terminating concurrent heavy test hosts. Also tighten the dmesg diagnostic to surface OOM-related lines only. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tools/dotnet-test-cloud.ps1 | 115 +++++++++++++++--------------------- 1 file changed, 46 insertions(+), 69 deletions(-) diff --git a/tools/dotnet-test-cloud.ps1 b/tools/dotnet-test-cloud.ps1 index 58695e8c..d1e27148 100644 --- a/tools/dotnet-test-cloud.ps1 +++ b/tools/dotnet-test-cloud.ps1 @@ -48,53 +48,34 @@ $testBinLog = Join-Path $ArtifactStagingFolder (Join-Path build_logs test.binlog $testDiagLog = Join-Path $ArtifactStagingFolder (Join-Path test_logs diag.log) $dumpStagingFolder = Join-Path $ArtifactStagingFolder 'crashDumps' -function Invoke-DotnetTest { - param( - [Parameter(Mandatory)][string]$Target, - [string]$BinLogSuffix = '' - ) - - $binLogPath = if ($BinLogSuffix) { - Join-Path $ArtifactStagingFolder (Join-Path build_logs "test.$BinLogSuffix.binlog") - } else { - $testBinLog - } - - & $dotnet test $Target ` - --no-build ` - -c $Configuration ` - --filter "TestCategory!=HighMemory&TestCategory!=RequiresHardware$env:TESTFILTER" ` - --collect "Code Coverage;Format=cobertura" ` - --settings "$PSScriptRoot/test.runsettings" ` - --blame-hang-timeout 1500s ` - --blame-crash ` - -bl:"$binLogPath" ` - --diag "$testDiagLog;TraceLevel=info" ` - --logger trx -} - function Write-MemorySnapshot([string]$Label) { if (-not $IsLinux) { return } Write-Host "" Write-Host "==== $Label ====" -ForegroundColor Yellow & free -h 2>&1 | Out-Host - Write-Host "---- /proc/meminfo (top 10) ----" - Get-Content /proc/meminfo -TotalCount 10 -ErrorAction SilentlyContinue | Out-Host + Write-Host "---- /proc/meminfo (top 12) ----" + Get-Content /proc/meminfo -TotalCount 12 -ErrorAction SilentlyContinue | Out-Host } function Write-DmesgTail { if (-not $IsLinux) { return } Write-Host "" Write-Host "==== dmesg tail (looking for OOM killer messages) ====" -ForegroundColor Yellow - # dmesg requires CAP_SYSLOG; try direct first, then sudo (most ADO Linux agents have passwordless sudo). 
- $out = & dmesg --ctime 2>$null | Select-Object -Last 200 - if ($LASTEXITCODE -ne 0 -or -not $out) { - $out = & sudo -n dmesg --ctime 2>$null | Select-Object -Last 200 - } - if ($LASTEXITCODE -ne 0 -or -not $out) { + # dmesg requires CAP_SYSLOG on most agents; try sudo (passwordless on ADO Linux pools). + # Filter to entries that mention oom/kill/Killed so the log stays compact. + $cmd = "(sudo -n dmesg --ctime 2>/dev/null || dmesg --ctime 2>/dev/null) | tail -n 400" + $out = & bash -c $cmd 2>$null + if (-not $out) { Write-Host '(dmesg unavailable: kernel.dmesg_restrict=1 and sudo not permitted)' -ForegroundColor DarkGray + return + } + # Surface OOM-relevant lines first, then a compact tail of everything else. + $oomLines = $out | Select-String -Pattern '(oom|killed|Killed|invoked oom-killer|Out of memory|memory cgroup)' -CaseSensitive:$false + if ($oomLines) { + Write-Host "---- OOM-related entries ----" -ForegroundColor Red + $oomLines | ForEach-Object { Write-Host $_.Line } } else { - $out | Out-Host + Write-Host "(no OOM-killer entries detected in dmesg tail)" -ForegroundColor DarkGray } } @@ -120,44 +101,40 @@ $env:DOTNET_DbgMiniDumpType = '2' # 2 = Heap (managed heap + threads; smaller th $env:DOTNET_DbgMiniDumpName = (Join-Path $dumpStagingFolder 'coredump.%p.%t.dmp') $env:DOTNET_CreateDumpDiagnostics = '1' -$overallExitCode = 0 - +# On Linux/macOS, the heavy generator test projects each consume several GB of RAM, +# and the default `dotnet test ` schedules MSBuild's VSTest target for multiple +# projects in parallel — causing the kernel OOM-killer to terminate test hosts on the +# memory-constrained ADO agents (exit code 137 = SIGKILL). Force a single MSBuild node +# so VSTest is invoked one project at a time. Use the solution-level invocation so that +# the sln's `NonWindows` configuration correctly filters out Windows-only projects. 
+$extraTestArgs = @() if (-not $IsWindows -and -not $x86) { - # On Linux/macOS the generator test projects each consume several GB of RAM. - # `dotnet test ` runs vstest hosts for multiple test projects in parallel, - # and on memory-constrained agents that causes the kernel OOM-killer to terminate - # one of the hosts (exit code 137 = SIGKILL). The previous attempt to fix this - # via XUNIT_MAX_PARALLEL_THREADS / BuildInParallel / RunConfiguration.MaxCpuCount - # only limits parallelism *within* a single test assembly, not across them. - # - # Mirror the working GitHub Actions Linux workflow: enumerate test projects and - # invoke `dotnet test` once per project, serially. - Write-Host 'Non-Windows agent: serializing test runs project-by-project to avoid OOM kills.' -ForegroundColor Cyan + Write-Host 'Non-Windows agent: forcing single MSBuild node (-m:1) to serialize test runs.' -ForegroundColor Cyan + $extraTestArgs += '-m:1' + # Also restrain xunit's intra-assembly parallelism: while it does not prevent + # cross-project OOM on its own, it reduces peak RSS during the heavy test runs. 
+ $env:XUNIT_MAX_PARALLEL_THREADS = '1' Write-MemorySnapshot 'Pre-test memory state' +} - $testProjects = Get-ChildItem -Path "$RepoRoot/test" -Directory | - Where-Object Name -Like '*.Tests' | - Sort-Object Name | - ForEach-Object { Get-ChildItem -Path $_.FullName -Filter '*.csproj' | Select-Object -First 1 } | - Where-Object { $_ } - - foreach ($proj in $testProjects) { - $projName = $proj.BaseName - Write-Host '' - Write-Host "▶️ Running tests in $projName" -ForegroundColor Cyan - Write-MemorySnapshot "Memory before $projName" - Invoke-DotnetTest -Target $proj.FullName -BinLogSuffix $projName - $thisExit = $LASTEXITCODE - if ($thisExit -ne 0) { - Write-Host "❌ Tests in $projName exited with code $thisExit" -ForegroundColor Red - $overallExitCode = $thisExit - Write-MemorySnapshot "Post-failure memory state ($projName)" - Write-DmesgTail - } - } -} else { - Invoke-DotnetTest -Target $RepoRoot - $overallExitCode = $LASTEXITCODE +& $dotnet test $RepoRoot ` + --no-build ` + -c $Configuration ` + --filter "TestCategory!=HighMemory&TestCategory!=RequiresHardware$env:TESTFILTER" ` + --collect "Code Coverage;Format=cobertura" ` + --settings "$PSScriptRoot/test.runsettings" ` + --blame-hang-timeout 1500s ` + --blame-crash ` + -bl:"$testBinLog" ` + --diag "$testDiagLog;TraceLevel=info" ` + --logger trx ` + @extraTestArgs + +$overallExitCode = $LASTEXITCODE +if ($overallExitCode -ne 0) { + Write-Host "❌ dotnet test exited with code $overallExitCode" -ForegroundColor Red + Write-MemorySnapshot 'Post-failure memory state' + Write-DmesgTail } # Move any captured crash dumps (from --blame-crash or DOTNET_DbgEnableMiniDump) into