Skip to content

Capture dump of hanging test process in Helix #21659

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 63 additions & 15 deletions eng/helix/content/RunTests/ProcessUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
Expand All @@ -19,10 +20,53 @@ public static class ProcessUtil
[DllImport("libc", SetLastError = true, EntryPoint = "kill")]
private static extern int sys_kill(int pid, int sig);

/// <summary>
/// Captures a dump of the current process into the directory named by the
/// <c>HELIX_DUMP_FOLDER</c> environment variable.
/// </summary>
/// <returns>
/// An already-completed task when <c>HELIX_DUMP_FOLDER</c> is not set; otherwise the task
/// returned by <see cref="CaptureDumpAsync(int, string)"/>.
/// </returns>
public static Task CaptureDumpAsync()
{
    var dumpDirectoryPath = Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");

    if (dumpDirectoryPath == null)
    {
        // No dump folder configured, so there is nowhere to write the dump.
        return Task.CompletedTask;
    }

    // Process is IDisposable; release it once the name and id have been read
    // instead of leaving it for the finalizer.
    using var process = Process.GetCurrentProcess();
    var dumpFilePath = Path.Combine(dumpDirectoryPath, $"{process.ProcessName}-{process.Id}.dmp");

    return CaptureDumpAsync(process.Id, dumpFilePath);
}

/// <summary>
/// Captures a dump of the process with the given id into the directory named by the
/// <c>HELIX_DUMP_FOLDER</c> environment variable.
/// </summary>
/// <param name="pid">Id of the process to dump.</param>
/// <returns>
/// An already-completed task when <c>HELIX_DUMP_FOLDER</c> is not set; otherwise the task
/// returned by <see cref="CaptureDumpAsync(int, string)"/>.
/// </returns>
public static Task CaptureDumpAsync(int pid)
{
    var dumpDirectoryPath = Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");

    if (dumpDirectoryPath == null)
    {
        // No dump folder configured, so there is nowhere to write the dump.
        return Task.CompletedTask;
    }

    // The Process object is only needed to build the dump file name; dispose it
    // afterwards so its underlying resources are not leaked.
    using var process = Process.GetProcessById(pid);
    var dumpFilePath = Path.Combine(dumpDirectoryPath, $"{process.ProcessName}.{process.Id}.dmp");

    return CaptureDumpAsync(process.Id, dumpFilePath);
}

/// <summary>
/// Runs <c>dotnet-dump collect</c> from the <c>HELIX_WORKITEM_ROOT</c> directory against the
/// given process, writing the dump to <paramref name="dumpFilePath"/>.
/// </summary>
/// <param name="pid">Id of the process to dump.</param>
/// <param name="dumpFilePath">Full path of the dump file to create.</param>
/// <returns>
/// An already-completed task on OSX or when the <c>dotnet-dump</c> tool is not present;
/// otherwise the task for the <c>dotnet-dump</c> invocation.
/// </returns>
public static Task CaptureDumpAsync(int pid, string dumpFilePath)
{
    // Skip this on OSX, we know it's unsupported right now
    if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
    {
        // Can we capture stacks or do a gcdump instead?
        return Task.CompletedTask;
    }

    var dotnetDumpPath = $"{Environment.GetEnvironmentVariable("HELIX_WORKITEM_ROOT")}/dotnet-dump";

    // Dump capture is best-effort diagnostics. The tool may be missing (for example when its
    // installation did not succeed), so skip the capture rather than trying to launch an
    // executable that is not there. The ".exe" variant covers the Windows tool shim.
    if (!File.Exists(dotnetDumpPath) && !File.Exists(dotnetDumpPath + ".exe"))
    {
        return Task.CompletedTask;
    }

    return RunAsync(dotnetDumpPath, $"collect -p {pid} -o \"{dumpFilePath}\"");
}

public static async Task<ProcessResult> RunAsync(
string filename,
string arguments,
string? workingDirectory = null,
string dumpDirectoryPath = null,
bool throwOnError = true,
IDictionary<string, string?>? environmentVariables = null,
Action<string>? outputDataReceived = null,
Expand Down Expand Up @@ -51,6 +95,14 @@ public static async Task<ProcessResult> RunAsync(
process.StartInfo.WorkingDirectory = workingDirectory;
}

dumpDirectoryPath ??= Environment.GetEnvironmentVariable("HELIX_DUMP_FOLDER");

if (dumpDirectoryPath != null)
{
process.StartInfo.EnvironmentVariables["COMPlus_DbgEnableMiniDump"] = "1";
process.StartInfo.EnvironmentVariables["COMPlus_DbgMiniDumpName"] = Path.Combine(dumpDirectoryPath, $"{Path.GetFileName(filename)}.%d.dmp");
}

if (environmentVariables != null)
{
foreach (var kvp in environmentVariables)
Expand Down Expand Up @@ -112,13 +164,20 @@ public static async Task<ProcessResult> RunAsync(
process.BeginOutputReadLine();
process.BeginErrorReadLine();

var cancelledTcs = new TaskCompletionSource<object?>();
await using var _ = cancellationToken.Register(() => cancelledTcs.TrySetResult(null));
var canceledTcs = new TaskCompletionSource<object?>();
await using var _ = cancellationToken.Register(() => canceledTcs.TrySetResult(null));

var result = await Task.WhenAny(processLifetimeTask.Task, cancelledTcs.Task);
var result = await Task.WhenAny(processLifetimeTask.Task, canceledTcs.Task);

if (result == cancelledTcs.Task)
if (result == canceledTcs.Task)
{
if (dumpDirectoryPath != null)
{
var dumpFilePath = Path.Combine(dumpDirectoryPath, $"{Path.GetFileName(filename)}.{process.Id}.dmp");
// Capture a process dump if the dumpDirectory is set
await CaptureDumpAsync(process.Id, dumpFilePath);
}

if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
sys_kill(process.Id, sig: 2); // SIGINT
Expand All @@ -143,16 +202,5 @@ public static async Task<ProcessResult> RunAsync(

return await processLifetimeTask.Task;
}

/// <summary>
/// Best-effort kill of the process with the given id.
/// </summary>
/// <param name="pid">Id of the process to kill.</param>
public static void KillProcess(int pid)
{
    try
    {
        using (var target = Process.GetProcessById(pid))
        {
            if (target != null)
            {
                target.Kill();
            }
        }
    }
    catch (Exception ex) when (ex is ArgumentException || ex is InvalidOperationException)
    {
        // Deliberately ignored: these are treated as "nothing to kill"
        // (presumably the process is not running or has already exited).
    }
}
}
}
10 changes: 5 additions & 5 deletions eng/helix/content/RunTests/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.CommandLine;
using System.IO;
using System.Runtime.InteropServices;
using System.Threading.Tasks;

namespace RunTests
Expand All @@ -14,7 +10,7 @@ class Program
{
static async Task Main(string[] args)
{
try
try
{
var runner = new TestRunner(RunTestsOptions.Parse(args));

Expand All @@ -27,6 +23,10 @@ static async Task Main(string[] args)
{
keepGoing = runner.InstallAspNetRefIfNeeded();
}
if (keepGoing)
{
keepGoing = await runner.InstallDotnetDump();
}

runner.DisplayContents();

Expand Down
11 changes: 9 additions & 2 deletions eng/helix/content/RunTests/RunTestsOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,21 @@ public static RunTestsOptions Parse(string[] args)
aliases: new string[] { "--ef" },
description: "The version of the EF tool to use")
{ Argument = new Argument<string>(), Required = true },

new Option(
aliases: new string[] { "--aspnetruntime" },
description: "The path to the aspnet runtime nupkg to install")
{ Argument = new Argument<string>(), Required = true },

new Option(
aliases: new string[] { "--aspnetref" },
description: "The path to the aspnet ref nupkg to install")
{ Argument = new Argument<string>(), Required = true },

new Option(
aliases: new string[] { "--helixTimeout" },
description: "The timeout duration of the Helix job")
{ Argument = new Argument<string>(), Required = true },
};

var parseResult = command.Parse(args);
Expand All @@ -73,6 +78,7 @@ public static RunTestsOptions Parse(string[] args)
options.EfVersion = parseResult.ValueForOption<string>("--ef");
options.AspNetRuntime = parseResult.ValueForOption<string>("--aspnetruntime");
options.AspNetRef = parseResult.ValueForOption<string>("--aspnetref");
options.Timeout = TimeSpan.Parse(parseResult.ValueForOption<string>("--helixTimeout"));
options.HELIX_WORKITEM_ROOT = Environment.GetEnvironmentVariable("HELIX_WORKITEM_ROOT");
options.Path = Environment.GetEnvironmentVariable("PATH");
options.DotnetRoot = Environment.GetEnvironmentVariable("DOTNET_ROOT");
Expand All @@ -91,5 +97,6 @@ public static RunTestsOptions Parse(string[] args)
public string HELIX_WORKITEM_ROOT { get; set;}
public string DotnetRoot { get; set; }
public string Path { get; set; }
public TimeSpan Timeout { get; set; }
}
}
60 changes: 43 additions & 17 deletions eng/helix/content/RunTests/TestRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

using System;
using System.Collections.Generic;
using System.CommandLine;
using System.IO;
using System.IO.Compression;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;

namespace RunTests
Expand All @@ -24,15 +24,15 @@ public TestRunner(RunTestsOptions options)

public bool SetupEnvironment()
{
try
try
{
// Rename default.NuGet.config to NuGet.config if there is not a custom one from the project
// We use a local NuGet.config file to avoid polluting global machine state and avoid relying on global machine state
if (!File.Exists("NuGet.config"))
{
File.Copy("default.NuGet.config", "NuGet.config");
}

EnvironmentVariables.Add("PATH", Options.Path);
EnvironmentVariables.Add("DOTNET_ROOT", Options.DotnetRoot);
EnvironmentVariables.Add("helix", Options.HelixQueue);
Expand Down Expand Up @@ -68,7 +68,7 @@ public bool SetupEnvironment()

public void DisplayContents(string path = "./")
{
try
try
{
Console.WriteLine();
Console.WriteLine($"Displaying directory contents for {path}:");
Expand All @@ -88,9 +88,9 @@ public void DisplayContents(string path = "./")
}
}

public async Task<bool> InstallAspNetAppIfNeededAsync()
public async Task<bool> InstallAspNetAppIfNeededAsync()
{
try
try
{
if (File.Exists(Options.AspNetRuntime))
{
Expand All @@ -113,7 +113,7 @@ public async Task<bool> InstallAspNetAppIfNeededAsync()
}
}
}

DisplayContents(appRuntimePath);

Console.WriteLine($"Adding current directory to nuget sources: {Options.HELIX_WORKITEM_ROOT}");
Expand Down Expand Up @@ -148,7 +148,7 @@ await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
Options.Path += $"{Environment.GetEnvironmentVariable("DOTNET_CLI_HOME")}/.dotnet/tools";
EnvironmentVariables["PATH"] = Options.Path;
}
else
else
{
Console.WriteLine($"No AspNetRuntime found: {Options.AspNetRuntime}, skipping...");
}
Expand All @@ -161,19 +161,19 @@ await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
}
}

public bool InstallAspNetRefIfNeeded()
public bool InstallAspNetRefIfNeeded()
{
try
try
{
if (File.Exists(Options.AspNetRef))
{
var refPath = $"Microsoft.AspNetCore.App.Ref";
Console.WriteLine($"Found AspNetRef: {Options.AspNetRef}, extracting to {refPath}");
ZipFile.ExtractToDirectory(Options.AspNetRef, "Microsoft.AspNetCore.App.Ref");

DisplayContents(refPath);
}
else
else
{
Console.WriteLine($"No AspNetRef found: {Options.AspNetRef}, skipping...");
}
Expand All @@ -185,15 +185,37 @@ public bool InstallAspNetRefIfNeeded()
return false;
}
}


/// <summary>
/// Runs <c>dotnet tool install dotnet-dump</c>, using <c>Options.HELIX_WORKITEM_ROOT</c>
/// as the tool path.
/// </summary>
/// <returns>
/// <c>true</c> when the install command ran without an exception being thrown (it is started
/// with <c>throwOnError: false</c> and its result is not inspected); <c>false</c> when an
/// exception was caught and logged.
/// </returns>
public async Task<bool> InstallDotnetDump()
{
    try
    {
        var dotnetPath = $"{Options.DotnetRoot}/dotnet";
        var installArguments =
            $"tool install dotnet-dump --tool-path {Options.HELIX_WORKITEM_ROOT} " +
            "--version 5.0.0-* --add-source https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json";

        await ProcessUtil.RunAsync(
            dotnetPath,
            installArguments,
            environmentVariables: EnvironmentVariables,
            outputDataReceived: Console.WriteLine,
            errorDataReceived: Console.Error.WriteLine,
            throwOnError: false);
    }
    catch (Exception e)
    {
        Console.WriteLine($"Exception in InstallDotnetDump: {e}");
        return false;
    }

    // Reaching this point means the install command was launched and awaited without throwing.
    return true;
}

public async Task<bool> CheckTestDiscoveryAsync()
{
try
{
// Run test discovery so we know if there are tests to run
var discoveryResult = await ProcessUtil.RunAsync($"{Options.DotnetRoot}/dotnet",
$"vstest {Options.Target} -lt",
environmentVariables: EnvironmentVariables);
environmentVariables: EnvironmentVariables,
cancellationToken: new CancellationTokenSource(TimeSpan.FromMinutes(2)).Token);

if (discoveryResult.StandardOutput.Contains("Exception thrown"))
{
Expand All @@ -213,8 +235,10 @@ public async Task<bool> CheckTestDiscoveryAsync()
public async Task<int> RunTestsAsync()
{
var exitCode = 0;
try
try
{
// Timeout test run 5 minutes before the Helix job would timeout
var cts = new CancellationTokenSource(Options.Timeout.Subtract(TimeSpan.FromMinutes(5)));
var commonTestArgs = $"vstest {Options.Target} --logger:xunit --logger:\"console;verbosity=normal\" --blame";
if (Options.Quarantined)
{
Expand All @@ -226,7 +250,8 @@ public async Task<int> RunTestsAsync()
environmentVariables: EnvironmentVariables,
outputDataReceived: Console.WriteLine,
errorDataReceived: Console.Error.WriteLine,
throwOnError: false);
throwOnError: false,
cancellationToken: cts.Token);

if (result.ExitCode != 0)
{
Expand All @@ -243,7 +268,8 @@ public async Task<int> RunTestsAsync()
environmentVariables: EnvironmentVariables,
outputDataReceived: Console.WriteLine,
errorDataReceived: Console.Error.WriteLine,
throwOnError: false);
throwOnError: false,
cancellationToken: cts.Token);

if (result.ExitCode != 0)
{
Expand Down
18 changes: 14 additions & 4 deletions eng/helix/content/runtests.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,18 @@ REM Need delayed expansion !PATH! so parens in the path don't mess up the parens
setlocal enabledelayedexpansion

REM Use '$' as a variable name prefix to avoid MSBuild variable collisions with these variables
set $target=%1
set $sdkVersion=%2
set $runtimeVersion=%3
set $queue=%4
set $arch=%5
set $quarantined=%6
set $ef=%7
set $aspnetruntime=%8
set $aspnetref=%9
REM Batch only supports up to 9 arguments using the %# syntax, need to shift to get more
shift
set $helixTimeout=%9
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is starting to get a bit out of control. Should we consider a better way to flow values from our build to the Helix test runner, such as a config file or environment variables? Otherwise it seems we are going to keep adding arguments over time. Not something we have to do as part of this PR, but it may be worth thinking about now.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed


set DOTNET_HOME=%HELIX_CORRELATION_PAYLOAD%\sdk
set DOTNET_ROOT=%DOTNET_HOME%\%$arch%
Expand All @@ -23,10 +32,11 @@ powershell.exe -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePo
set exit_code=0
echo "Restore: dotnet restore RunTests\RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources..."
dotnet restore RunTests\RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources
echo "Running tests: dotnet run --project RunTests\RunTests.csproj -- --target %1 --sdk %2 --runtime %3 --queue %4 --arch %5 --quarantined %6 --ef %7 --aspnetruntime %8 --aspnetref %9..."
dotnet run --project RunTests\RunTests.csproj -- --target %1 --sdk %2 --runtime %3 --queue %4 --arch %5 --quarantined %6 --ef %7 --aspnetruntime %8 --aspnetref %9
if errorlevel 1 (
set exit_code=1

echo "Running tests: dotnet run --project RunTests\RunTests.csproj -- --target %$target% --sdk %$sdkVersion% --runtime %$runtimeVersion% --queue %$queue% --arch %$arch% --quarantined %$quarantined% --ef %$ef% --aspnetruntime %$aspnetruntime% --aspnetref %$aspnetref% --helixTimeout %$helixTimeout%..."
dotnet run --project RunTests\RunTests.csproj -- --target %$target% --sdk %$sdkVersion% --runtime %$runtimeVersion% --queue %$queue% --arch %$arch% --quarantined %$quarantined% --ef %$ef% --aspnetruntime %$aspnetruntime% --aspnetref %$aspnetref% --helixTimeout %$helixTimeout%
REM "if errorlevel" only accepts a bare number (meaning "exit code >= number"), not a
REM comparison operator, so compare the expanded %errorlevel% value instead.
if %errorlevel% neq 0 (
    set exit_code=%errorlevel%
)
echo "Finished running tests: exit_code=%exit_code%"
exit /b %exit_code%
4 changes: 2 additions & 2 deletions eng/helix/content/runtests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ sync
exit_code=0
echo "Restore: $DOTNET_ROOT/dotnet restore RunTests/RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources..."
$DOTNET_ROOT/dotnet restore RunTests/RunTests.csproj --source https://api.nuget.org/v3/index.json --ignore-failed-sources
echo "Running tests: $DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9..."
$DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9
echo "Running tests: $DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9 --helixTimeout ${10}..."
$DOTNET_ROOT/dotnet run --project RunTests/RunTests.csproj -- --target $1 --sdk $2 --runtime $3 --queue $4 --arch $5 --quarantined $6 --ef $7 --aspnetruntime $8 --aspnetref $9 --helixTimeout ${10}
exit_code=$?
echo "Finished tests...exit_code=$exit_code"
exit $exit_code
Loading