Skip to content

Commit f56c61b

Browse files
Capture dump of hanging test process in Helix (#21659)
1 parent 5a0c097 commit f56c61b

File tree

7 files changed

+143
-52
lines changed

7 files changed

+143
-52
lines changed

eng/helix/content/RunTests/ProcessUtil.cs

Lines changed: 63 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Diagnostics;
8+
using System.IO;
89
using System.Runtime.InteropServices;
910
using System.Text;
1011
using System.Threading;
@@ -19,10 +20,53 @@ public static class ProcessUtil
1920
[DllImport("libc", SetLastError = true, EntryPoint = "kill")]
2021
private static extern int sys_kill(int pid, int sig);
2122

23+
/// <summary>
/// Captures a dump of the current process into the folder named by the
/// <c>HELIX_DUMP_FOLDER</c> environment variable.
/// </summary>
/// <returns>
/// An already-completed task when <c>HELIX_DUMP_FOLDER</c> is not set; otherwise the task
/// returned by <see cref="CaptureDumpAsync(int, string)"/>.
/// </returns>
public static Task CaptureDumpAsync()
{
    var dumpDirectoryPath = Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");

    if (dumpDirectoryPath == null)
    {
        return Task.CompletedTask;
    }

    // Process is IDisposable; release it once the name and id have been read.
    using var process = Process.GetCurrentProcess();

    // Use the "<name>.<pid>.dmp" pattern that the other dump paths in this class use.
    var dumpFilePath = Path.Combine(dumpDirectoryPath, $"{process.ProcessName}.{process.Id}.dmp");

    return CaptureDumpAsync(process.Id, dumpFilePath);
}
37+
38+
/// <summary>
/// Captures a dump of the process with the given id into the folder named by the
/// <c>HELIX_DUMP_FOLDER</c> environment variable.
/// </summary>
/// <param name="pid">Id of the process to dump.</param>
/// <returns>
/// An already-completed task when <c>HELIX_DUMP_FOLDER</c> is not set or the process can no
/// longer be inspected; otherwise the task returned by
/// <see cref="CaptureDumpAsync(int, string)"/>.
/// </returns>
public static Task CaptureDumpAsync(int pid)
{
    var dumpDirectoryPath = Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");

    if (dumpDirectoryPath == null)
    {
        return Task.CompletedTask;
    }

    string dumpFilePath;

    try
    {
        // Process is IDisposable; only its name and id are needed to build the file name.
        using var process = Process.GetProcessById(pid);
        dumpFilePath = Path.Combine(dumpDirectoryPath, $"{process.ProcessName}.{process.Id}.dmp");
    }
    catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
    {
        // GetProcessById throws ArgumentException when the process is not running, and
        // ProcessName throws InvalidOperationException once it has exited. Dump capture is
        // a best-effort diagnostic, so report the problem and carry on instead of failing.
        Console.WriteLine($"Skipping dump capture for process {pid}: {e.Message}");
        return Task.CompletedTask;
    }

    return CaptureDumpAsync(pid, dumpFilePath);
}
52+
53+
/// <summary>
/// Runs <c>dotnet-dump collect</c> against the given process, writing the dump to
/// <paramref name="dumpFilePath"/>.
/// </summary>
/// <param name="pid">Id of the process to dump.</param>
/// <param name="dumpFilePath">Path of the dump file to create.</param>
/// <returns>
/// An already-completed task on OSX, where nothing is run; otherwise the task returned by
/// <c>RunAsync</c> for the <c>dotnet-dump</c> invocation.
/// </returns>
public static Task CaptureDumpAsync(int pid, string dumpFilePath)
{
    // Dump collection is known to be unsupported on OSX right now, so it is skipped there.
    // Can we capture stacks or do a gcdump instead?
    var canCollect = !RuntimeInformation.IsOSPlatform(OSPlatform.OSX);

    if (canCollect)
    {
        // The tool is looked up directly under HELIX_WORKITEM_ROOT.
        var workItemRoot = Environment.GetEnvironmentVariable("HELIX_WORKITEM_ROOT");
        var dumpToolPath = $"{workItemRoot}/dotnet-dump";
        var collectArguments = $"collect -p {pid} -o \"{dumpFilePath}\"";

        return RunAsync(dumpToolPath, collectArguments);
    }

    return Task.CompletedTask;
}
64+
2265
public static async Task<ProcessResult> RunAsync(
2366
string filename,
2467
string arguments,
2568
string? workingDirectory = null,
69+
string dumpDirectoryPath = null,
2670
bool throwOnError = true,
2771
IDictionary<string, string?>? environmentVariables = null,
2872
Action<string>? outputDataReceived = null,
@@ -51,6 +95,14 @@ public static async Task<ProcessResult> RunAsync(
5195
process.StartInfo.WorkingDirectory = workingDirectory;
5296
}
5397

98+
dumpDirectoryPath ??= Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");
99+
100+
if (dumpDirectoryPath != null)
101+
{
102+
process.StartInfo.EnvironmentVariables["COMPlus_DbgEnableMiniDump"] = "1";
103+
process.StartInfo.EnvironmentVariables["COMPlus_DbgMiniDumpName"] = Path.Combine(dumpDirectoryPath, $"{Path.GetFileName(filename)}.%d.dmp");
104+
}
105+
54106
if (environmentVariables != null)
55107
{
56108
foreach (var kvp in environmentVariables)
@@ -112,13 +164,20 @@ public static async Task<ProcessResult> RunAsync(
112164
process.BeginOutputReadLine();
113165
process.BeginErrorReadLine();
114166

115-
var cancelledTcs = new TaskCompletionSource<object?>();
116-
await using var _ = cancellationToken.Register(() => cancelledTcs.TrySetResult(null));
167+
var canceledTcs = new TaskCompletionSource<object?>();
168+
await using var _ = cancellationToken.Register(() => canceledTcs.TrySetResult(null));
117169

118-
var result = await Task.WhenAny(processLifetimeTask.Task, cancelledTcs.Task);
170+
var result = await Task.WhenAny(processLifetimeTask.Task, canceledTcs.Task);
119171

120-
if (result == cancelledTcs.Task)
172+
if (result == canceledTcs.Task)
121173
{
174+
if (dumpDirectoryPath != null)
175+
{
176+
var dumpFilePath = Path.Combine(dumpDirectoryPath, $"{Path.GetFileName(filename)}.{process.Id}.dmp");
177+
// Capture a process dump if the dumpDirectory is set
178+
await CaptureDumpAsync(process.Id, dumpFilePath);
179+
}
180+
122181
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
123182
{
124183
sys_kill(process.Id, sig: 2); // SIGINT
@@ -143,16 +202,5 @@ public static async Task<ProcessResult> RunAsync(
143202

144203
return await processLifetimeTask.Task;
145204
}
146-
147-
public static void KillProcess(int pid)
148-
{
149-
try
150-
{
151-
using var process = Process.GetProcessById(pid);
152-
process?.Kill();
153-
}
154-
catch (ArgumentException) { }
155-
catch (InvalidOperationException) { }
156-
}
157205
}
158206
}

eng/helix/content/RunTests/Program.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@
22
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
33

44
using System;
5-
using System.Collections.Generic;
6-
using System.CommandLine;
7-
using System.IO;
8-
using System.Runtime.InteropServices;
95
using System.Threading.Tasks;
106

117
namespace RunTests
@@ -14,7 +10,7 @@ class Program
1410
{
1511
static async Task Main(string[] args)
1612
{
17-
try
13+
try
1814
{
1915
var runner = new TestRunner(RunTestsOptions.Parse(args));
2016

@@ -27,6 +23,10 @@ static async Task Main(string[] args)
2723
{
2824
keepGoing = runner.InstallAspNetRefIfNeeded();
2925
}
26+
if (keepGoing)
27+
{
28+
keepGoing = await runner.InstallDotnetDump();
29+
}
3030

3131
runner.DisplayContents();
3232

eng/helix/content/RunTests/RunTestsOptions.cs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,21 @@ public static RunTestsOptions Parse(string[] args)
5050
aliases: new string[] { "--ef" },
5151
description: "The version of the EF tool to use")
5252
{ Argument = new Argument<string>(), Required = true },
53-
53+
5454
new Option(
5555
aliases: new string[] { "--aspnetruntime" },
5656
description: "The path to the aspnet runtime nupkg to install")
5757
{ Argument = new Argument<string>(), Required = true },
58-
58+
5959
new Option(
6060
aliases: new string[] { "--aspnetref" },
6161
description: "The path to the aspnet ref nupkg to install")
6262
{ Argument = new Argument<string>(), Required = true },
63+
64+
new Option(
65+
aliases: new string[] { "--helixTimeout" },
66+
description: "The timeout duration of the Helix job")
67+
{ Argument = new Argument<string>(), Required = true },
6368
};
6469

6570
var parseResult = command.Parse(args);
@@ -73,6 +78,7 @@ public static RunTestsOptions Parse(string[] args)
7378
options.EfVersion = parseResult.ValueForOption<string>("--ef");
7479
options.AspNetRuntime = parseResult.ValueForOption<string>("--aspnetruntime");
7580
options.AspNetRef = parseResult.ValueForOption<string>("--aspnetref");
81+
options.Timeout = TimeSpan.Parse(parseResult.ValueForOption<string>("--helixTimeout"));
7682
options.HELIX_WORKITEM_ROOT = Environment.GetEnvironmentVariable("HELIX_WORKITEM_ROOT");
7783
options.Path = Environment.GetEnvironmentVariable("PATH");
7884
options.DotnetRoot = Environment.GetEnvironmentVariable("DOTNET_ROOT");
@@ -91,5 +97,6 @@ public static RunTestsOptions Parse(string[] args)
9197
public string HELIX_WORKITEM_ROOT { get; set;}
9298
public string DotnetRoot { get; set; }
9399
public string Path { get; set; }
100+
public TimeSpan Timeout { get; set; }
94101
}
95102
}

eng/helix/content/RunTests/TestRunner.cs

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33

44
using System;
55
using System.Collections.Generic;
6-
using System.CommandLine;
76
using System.IO;
87
using System.IO.Compression;
98
using System.Runtime.InteropServices;
9+
using System.Threading;
1010
using System.Threading.Tasks;
1111

1212
namespace RunTests
@@ -24,15 +24,15 @@ public TestRunner(RunTestsOptions options)
2424

2525
public bool SetupEnvironment()
2626
{
27-
try
27+
try
2828
{
2929
// Rename default.NuGet.config to NuGet.config if there is not a custom one from the project
3030
// We use a local NuGet.config file to avoid polluting global machine state and avoid relying on global machine state
3131
if (!File.Exists("NuGet.config"))
3232
{
3333
File.Copy("default.NuGet.config", "NuGet.config");
3434
}
35-
35+
3636
EnvironmentVariables.Add("PATH", Options.Path);
3737
EnvironmentVariables.Add("DOTNET_ROOT", Options.DotnetRoot);
3838
EnvironmentVariables.Add("helix", Options.HelixQueue);
@@ -68,7 +68,7 @@ public bool SetupEnvironment()
6868

6969
public void DisplayContents(string path = "./")
7070
{
71-
try
71+
try
7272
{
7373
Console.WriteLine();
7474
Console.WriteLine($"Displaying directory contents for {path}:");
@@ -88,9 +88,9 @@ public void DisplayContents(string path = "./")
8888
}
8989
}
9090

91-
public async Task<bool> InstallAspNetAppIfNeededAsync()
91+
public async Task<bool> InstallAspNetAppIfNeededAsync()
9292
{
93-
try
93+
try
9494
{
9595
if (File.Exists(Options.AspNetRuntime))
9696
{
@@ -113,7 +113,7 @@ public async Task<bool> InstallAspNetAppIfNeededAsync()
113113
}
114114
}
115115
}
116-
116+
117117
DisplayContents(appRuntimePath);
118118

119119
Console.WriteLine($"Adding current directory to nuget sources: {Options.HELIX_WORKITEM_ROOT}");
@@ -152,7 +152,7 @@ await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
152152
Options.Path += $"{Environment.GetEnvironmentVariable("DOTNET_CLI_HOME")}/.dotnet/tools";
153153
EnvironmentVariables["PATH"] = Options.Path;
154154
}
155-
else
155+
else
156156
{
157157
Console.WriteLine($"No AspNetRuntime found: {Options.AspNetRuntime}, skipping...");
158158
}
@@ -165,19 +165,19 @@ await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
165165
}
166166
}
167167

168-
public bool InstallAspNetRefIfNeeded()
168+
public bool InstallAspNetRefIfNeeded()
169169
{
170-
try
170+
try
171171
{
172172
if (File.Exists(Options.AspNetRef))
173173
{
174174
var refPath = $"Microsoft.AspNetCore.App.Ref";
175175
Console.WriteLine($"Found AspNetRef: {Options.AspNetRef}, extracting to {refPath}");
176176
ZipFile.ExtractToDirectory(Options.AspNetRef, "Microsoft.AspNetCore.App.Ref");
177-
177+
178178
DisplayContents(refPath);
179179
}
180-
else
180+
else
181181
{
182182
Console.WriteLine($"No AspNetRef found: {Options.AspNetRef}, skipping...");
183183
}
@@ -189,15 +189,37 @@ public bool InstallAspNetRefIfNeeded()
189189
return false;
190190
}
191191
}
192-
192+
193+
/// <summary>
/// Installs the <c>dotnet-dump</c> tool into <c>Options.HELIX_WORKITEM_ROOT</c> by running
/// <c>dotnet tool install</c>.
/// </summary>
/// <returns>
/// <c>true</c> once the install command has been run (its exit code is not checked, since
/// it is invoked with <c>throwOnError: false</c>); <c>false</c> if an exception was thrown.
/// </returns>
public async Task<bool> InstallDotnetDump()
{
    try
    {
        var dotnetPath = $"{Options.DotnetRoot}/dotnet";
        var installArguments =
            $"tool install dotnet-dump --tool-path {Options.HELIX_WORKITEM_ROOT} " +
            "--version 5.0.0-* --add-source https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json";

        // Stream the tool's output and errors straight to this process's console.
        await ProcessUtil.RunAsync(
            dotnetPath,
            installArguments,
            environmentVariables: EnvironmentVariables,
            outputDataReceived: Console.WriteLine,
            errorDataReceived: Console.Error.WriteLine,
            throwOnError: false);
    }
    catch (Exception e)
    {
        Console.WriteLine($"Exception in InstallDotnetDump: {e}");
        return false;
    }

    return true;
}
213+
193214
public async Task<bool> CheckTestDiscoveryAsync()
194215
{
195216
try
196217
{
197218
// Run test discovery so we know if there are tests to run
198219
var discoveryResult = await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
199220
$"vstest {Options.Target} -lt",
200-
environmentVariables: EnvironmentVariables);
221+
environmentVariables: EnvironmentVariables,
222+
cancellationToken: new CancellationTokenSource(TimeSpan.FromMinutes(2)).Token);
201223

202224
if (discoveryResult.StandardOutput.Contains("Exception thrown"))
203225
{
@@ -217,8 +239,10 @@ public async Task<bool> CheckTestDiscoveryAsync()
217239
public async Task<int> RunTestsAsync()
218240
{
219241
var exitCode = 0;
220-
try
242+
try
221243
{
244+
// Time out the test run 5 minutes before the Helix job would time out
245+
var cts = new CancellationTokenSource(Options.Timeout.Subtract(TimeSpan.FromMinutes(5)));
222246
var commonTestArgs = $"vstest {Options.Target} --logger:xunit --logger:\"console;verbosity=normal\" --blame";
223247
if (Options.Quarantined)
224248
{
@@ -230,7 +254,8 @@ public async Task<int> RunTestsAsync()
230254
environmentVariables: EnvironmentVariables,
231255
outputDataReceived: Console.WriteLine,
232256
errorDataReceived: Console.Error.WriteLine,
233-
throwOnError: false);
257+
throwOnError: false,
258+
cancellationToken: cts.Token);
234259

235260
if (result.ExitCode != 0)
236261
{
@@ -247,7 +272,8 @@ public async Task<int> RunTestsAsync()
247272
environmentVariables: EnvironmentVariables,
248273
outputDataReceived: Console.WriteLine,
249274
errorDataReceived: Console.Error.WriteLine,
250-
throwOnError: false);
275+
throwOnError: false,
276+
cancellationToken: cts.Token);
251277

252278
if (result.ExitCode != 0)
253279
{

eng/helix/content/runtests.cmd

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,18 @@ REM Need delayed expansion !PATH! so parens in the path don't mess up the parens
33
setlocal enabledelayedexpansion
44

55
REM Use '$' as a variable name prefix to avoid MSBuild variable collisions with these variables
6+
set $target=%1
67
set $sdkVersion=%2
78
set $runtimeVersion=%3
9+
set $queue=%4
810
set $arch=%5
11+
set $quarantined=%6
12+
set $ef=%7
13+
set $aspnetruntime=%8
14+
set $aspnetref=%9
15+
REM Batch only supports up to 9 arguments using the %# syntax, need to shift to get more
16+
shift
17+
set $helixTimeout=%9
918

1019
set DOTNET_HOME=%HELIX_CORRELATION_PAYLOAD%\sdk
1120
set DOTNET_ROOT=%DOTNET_HOME%\%$arch%
@@ -23,10 +32,11 @@ powershell.exe -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePo
2332
set exit_code=0
2433
echo "Restore: dotnet restore RunTests\RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources..."
2534
dotnet restore RunTests\RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources
26-
echo "Running tests: dotnet run --project RunTests\RunTests.csproj -- --target %1 --sdk %2 --runtime %3 --queue %4 --arch %5 --quarantined %6 --ef %7 --aspnetruntime %8 --aspnetref %9..."
27-
dotnet run --project RunTests\RunTests.csproj -- --target %1 --sdk %2 --runtime %3 --queue %4 --arch %5 --quarantined %6 --ef %7 --aspnetruntime %8 --aspnetref %9
28-
if errorlevel 1 (
29-
set exit_code=1
35+
36+
echo "Running tests: dotnet run --project RunTests\RunTests.csproj -- --target %$target% --sdk %$sdkVersion% --runtime %$runtimeVersion% --queue %$queue% --arch %$arch% --quarantined %$quarantined% --ef %$ef% --aspnetruntime %$aspnetruntime% --aspnetref %$aspnetref% --helixTimeout %$helixTimeout%..."
37+
dotnet run --project RunTests\RunTests.csproj -- --target %$target% --sdk %$sdkVersion% --runtime %$runtimeVersion% --queue %$queue% --arch %$arch% --quarantined %$quarantined% --ef %$ef% --aspnetruntime %$aspnetruntime% --aspnetref %$aspnetref% --helixTimeout %$helixTimeout%
38+
REM Propagate the exit code of "dotnet run" when it reports a failure.
REM "if errorlevel" only accepts a bare number, so compare the expanded value instead.
if %errorlevel% neq 0 (
    set exit_code=%errorlevel%
)
3141
echo "Finished running tests: exit_code=%exit_code%"
3242
exit /b %exit_code%

eng/helix/content/runtests.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ sync
8888
exit_code=0
8989
echo "Restore: $DOTNET_ROOT/dotnet restore RunTests/RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources..."
9090
$DOTNET_ROOT/dotnet restore RunTests/RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources
91-
echo "Running tests: $DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9..."
92-
$DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9
91+
echo "Running tests: $DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9 --helixTimeout ${10}..."
92+
$DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9 --helixTimeout ${10}
9393
exit_code=$?
9494
echo "Finished tests...exit_code=$exit_code"
9595
exit $exit_code

0 commit comments

Comments
 (0)