Skip to content

Commit 009b9f4

Browse files
authored
[lit] Fix lit hang on pool join when exception is thrown (#131881)
Fixes #133914. When using the internal shell with a timeout set, lit will hang on the following call if an exception is thrown and not immediately caught: https://github.com/llvm/llvm-project/blob/19970535f92c0f2dcda01b7fc60f95945166e424/llvm/utils/lit/lit/run.py#L93 This can occur when using the internal lit shell and trying to run a program that does not exist. In this case, `_executeShCmd` throws an internal shell error that is not caught by the function directly calling it, `executeShCmd`; instead, it is caught one function higher in the call stack, in `executeScriptInternal`. Because the exception propagates up the call stack instead of being caught immediately, lit hangs until the test timeout expires. This patch moves the handling of this exception into `executeShCmd` to avoid the hang. For more background on what causes this hang, see: https://stackoverflow.com/questions/15314189/python-multiprocessing-pool-hangs-at-join
1 parent 573721b commit 009b9f4

File tree

4 files changed

+47
-10
lines changed

4 files changed

+47
-10
lines changed

llvm/utils/lit/lit/TestRunner.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,14 @@ def executeShCmd(cmd, shenv, results, timeout=0):
201201
timeoutHelper = TimeoutHelper(timeout)
202202
if timeout > 0:
203203
timeoutHelper.startTimer()
204-
finalExitCode = _executeShCmd(cmd, shenv, results, timeoutHelper)
204+
try:
205+
finalExitCode = _executeShCmd(cmd, shenv, results, timeoutHelper)
206+
except InternalShellError:
207+
e = sys.exc_info()[1]
208+
finalExitCode = 127
209+
results.append(
210+
ShellCommandResult(e.command, "", e.message, finalExitCode, False)
211+
)
205212
timeoutHelper.cancel()
206213
timeoutInfo = None
207214
if timeoutHelper.timeoutReached():
@@ -1105,15 +1112,10 @@ def executeScriptInternal(
11051112

11061113
results = []
11071114
timeoutInfo = None
1108-
try:
1109-
shenv = ShellEnvironment(cwd, test.config.environment)
1110-
exitCode, timeoutInfo = executeShCmd(
1111-
cmd, shenv, results, timeout=litConfig.maxIndividualTestTime
1112-
)
1113-
except InternalShellError:
1114-
e = sys.exc_info()[1]
1115-
exitCode = 127
1116-
results.append(ShellCommandResult(e.command, "", e.message, exitCode, False))
1115+
shenv = ShellEnvironment(cwd, test.config.environment)
1116+
exitCode, timeoutInfo = executeShCmd(
1117+
cmd, shenv, results, timeout=litConfig.maxIndividualTestTime
1118+
)
11171119

11181120
out = err = ""
11191121
for i, result in enumerate(results):
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import lit.formats
2+
3+
config.name = "timeout-hang"
4+
config.suffixes = [".txt"]
5+
config.test_format = lit.formats.ShTest()
6+
config.test_source_root = None
7+
config.test_exec_root = None
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
RUN: nonexistent

llvm/utils/lit/tests/timeout-hang.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# REQUIRES: lit-max-individual-test-time
2+
3+
# Python has some issues dealing with exceptions when multiprocessing,
4+
# which can cause hangs. Previously this could occur when we encountered
5+
# an internal shell exception, and had a timeout set.
6+
7+
# This test runs a lit test that tries to launch a non-existent file,
8+
# throwing an exception. We expect this to fail immediately, rather than
9+
# time out.
10+
11+
# DEFINE: %{timeout}=1
12+
13+
# RUN: not %{lit} %{inputs}/timeout-hang/run-nonexistent.txt \
14+
# RUN: --timeout=%{timeout} --param external=0 | %{python} %s %{timeout}
15+
16+
import sys
17+
import re
18+
19+
timeout_time = float(sys.argv[1])
20+
testing_time = float(re.search(r"Testing Time: (.*)s", sys.stdin.read()).group(1))
21+
22+
if testing_time < timeout_time:
23+
print("Testing took less than timeout")
24+
sys.exit(0)
25+
else:
26+
print("Testing took as long or longer than timeout")
27+
sys.exit(1)

0 commit comments

Comments
 (0)