Skip to content

Commit e8d1e30

Browse files
authored
[LIT][E2E] Fix LIT hang when executing non-existent files (#17505)
Fixes #16351. In our containers, if an exception is not immediately caught, lit will hang on the following call: https://github.com/intel/llvm/blob/4adaef0e444cf9a7e11f433b9995ece6bf9e0aa4/llvm/utils/lit/lit/run.py#L93 This can occur when using the internal lit shell and trying to run a program that does not exist. In this case `_executeShCmd` will raise an internal shell error, which is not caught by the function directly calling it, `executeShCmd`; rather, it is caught one function higher in the call stack, in `executeScriptInternal`. Because that exception is percolated up the call stack instead of being immediately caught, lit will hang. To avoid this, this patch moves the place where we catch this exception to `executeShCmd`. Previously, to avoid the hang, we would use the external lit shell. However, this introduces some differences in how we need to write tests (i.e., needing to add `--crash` for certain `not` calls), it slightly changes how test output is printed (all in one block, rather than separated by `RUN:` lines), and it messes up the path to executables when running on Windows (every `\` is interpreted as an escape for the next character, leading to an attempt to execute a non-existent file). This PR also changes the E2E tests to always use the internal lit shell. For more background on what causes this hang, see: https://stackoverflow.com/questions/15314189/python-multiprocessing-pool-hangs-at-join https://bugs.python.org/issue9400 python/cpython#53646
1 parent 34c7c60 commit e8d1e30

File tree

5 files changed

+48
-16
lines changed

5 files changed

+48
-16
lines changed

llvm/utils/lit/lit/TestRunner.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,14 @@ def executeShCmd(cmd, shenv, results, timeout=0):
201201
timeoutHelper = TimeoutHelper(timeout)
202202
if timeout > 0:
203203
timeoutHelper.startTimer()
204-
finalExitCode = _executeShCmd(cmd, shenv, results, timeoutHelper)
204+
try:
205+
finalExitCode = _executeShCmd(cmd, shenv, results, timeoutHelper)
206+
except InternalShellError:
207+
e = sys.exc_info()[1]
208+
finalExitCode = 127
209+
results.append(
210+
ShellCommandResult(e.command, "", e.message, finalExitCode, False)
211+
)
205212
timeoutHelper.cancel()
206213
timeoutInfo = None
207214
if timeoutHelper.timeoutReached():
@@ -1105,15 +1112,10 @@ def executeScriptInternal(
11051112

11061113
results = []
11071114
timeoutInfo = None
1108-
try:
1109-
shenv = ShellEnvironment(cwd, test.config.environment)
1110-
exitCode, timeoutInfo = executeShCmd(
1111-
cmd, shenv, results, timeout=litConfig.maxIndividualTestTime
1112-
)
1113-
except InternalShellError:
1114-
e = sys.exc_info()[1]
1115-
exitCode = 127
1116-
results.append(ShellCommandResult(e.command, "", e.message, exitCode, False))
1115+
shenv = ShellEnvironment(cwd, test.config.environment)
1116+
exitCode, timeoutInfo = executeShCmd(
1117+
cmd, shenv, results, timeout=litConfig.maxIndividualTestTime
1118+
)
11171119

11181120
out = err = ""
11191121
for i, result in enumerate(results):
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import lit.formats
2+
3+
config.name = "timeout-hang"
4+
config.suffixes = [".txt"]
5+
config.test_format = lit.formats.ShTest()
6+
config.test_source_root = None
7+
config.test_exec_root = None
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
RUN: nonexistent

llvm/utils/lit/tests/timeout-hang.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# REQUIRES: lit-max-individual-test-time
2+
3+
# Python has some issues dealing with exceptions when multiprocessing,
4+
# which can cause hangs. Previously this could occur when we encountered
5+
# an internal shell exception, and had a timeout set.
6+
7+
# This test runs a lit test that tries to launch a non-existent file,
8+
# throwing an exception. We expect this to fail immediately, rather than
9+
# timeout.
10+
11+
# DEFINE: %{timeout}=1
12+
13+
# RUN: not %{lit} %{inputs}/timeout-hang/run-nonexistent.txt \
14+
# RUN: --timeout=%{timeout} --param external=0 | %{python} %s %{timeout}
15+
16+
import sys
17+
import re
18+
19+
timeout_time = float(sys.argv[1])
20+
testing_time = float(re.search(r"Testing Time: (.*)s", sys.stdin.read()).group(1))
21+
22+
if testing_time < timeout_time:
23+
print("Testing took less than timeout")
24+
sys.exit(0)
25+
else:
26+
print("Testing took as long or longer than timeout")
27+
sys.exit(1)

sycl/test-e2e/format.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -374,15 +374,10 @@ def get_extra_env(sycl_devices):
374374
recursion_limit=test.config.recursiveExpansionLimit,
375375
)
376376

377-
# TODO: workaround for lit hanging when executing non-existent binary
378-
# inside our containers
379377
if len(script) == 0:
380378
return lit.Test.Result(lit.Test.UNSUPPORTED, "Lit script is empty")
381-
useExternalSh = test.config.test_mode == "run-only"
382379

383-
result = lit.TestRunner._runShTest(
384-
test, litConfig, useExternalSh, script, tmpBase
385-
)
380+
result = lit.TestRunner._runShTest(test, litConfig, False, script, tmpBase)
386381

387382
# Single triple/device - might be an XFAIL.
388383
def map_result(features, code):

0 commit comments

Comments
 (0)