Commit b966e44
CONSOLIDATED COMMITS: unit test skips and unskips
=================================================

Temporarily skip test_conv3d_64bit_indexing - rocBLAS API support is requested - SWDEV-383635 & sub-task SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
Resolves https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837.
For more context, see upstream issue pytorch#111834.

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check as a function (#1364)
* Moved NAVI check to the test file
* Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Bad import in test_torchinductor and skip torchvision related UT (#1374)

Skip test_inductor_freezing failing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
* Running the triton kernel on ROCm reports only one GB/s metric
* Update test_kernel_benchmark.py

Skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)
skipIfRocm needs msg parameter

[NO CP] Updated changes to skip a few UTs

Imported skipIfRocm in certain test suites (#1577)
Fixes SWDEV-472397

Added functions imports (#1521)
Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
Fixes SWDEV-462410. Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
(cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
Fixes pytorch#8974
(cherry picked from commit a688e0a)
1 parent 51ce1cc commit b966e44

13 files changed: +62 additions, -11 deletions
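Taken together, the consolidated commits above apply one pattern: mark ROCm-specific failures at the test level with skipIfRocm / skipIfRocmArch rather than removing or rewriting the tests. The following is a minimal, illustrative sketch of that pattern, assuming the decorators exported by torch.testing._internal.common_utils on this branch (skipIfRocm accepting a msg keyword, and the skipIfRocmArch decorator added in this commit); the test class, test names, and skip message are hypothetical.

# Hypothetical example of the skip pattern used throughout this commit.
import unittest

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, skipIfRocmArch

NAVI_ARCH = ("gfx1100", "gfx1101")  # same tuple the inductor tests define for Navi-only skips


class ExampleRocmSkips(unittest.TestCase):
    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")  # skipped on every ROCm build
    def test_skipped_on_all_rocm(self):
        self.assertEqual(torch.ones(1).item(), 1.0)

    @skipIfRocmArch(NAVI_ARCH)  # skipped only when the ROCm GPU reports gfx1100/gfx1101
    def test_skipped_only_on_navi(self):
        self.assertEqual(torch.zeros(2).sum().item(), 0.0)


if __name__ == "__main__":
    run_tests()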

test/dynamo/test_structured_trace.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_utils import find_free_port
+from torch.testing._internal.common_utils import find_free_port, skipIfRocm
 from torch.testing._internal.inductor_utils import HAS_CUDA


@@ -237,6 +237,7 @@ def test_schedule(self):
         self.assertParses()

     @requires_cuda
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_cudagraphs(self):
         fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
         fn_opt(torch.ones(1000, 1000, device="cuda"))

test/functorch/test_ops.py

Lines changed: 1 addition & 1 deletion
@@ -951,7 +951,7 @@ def fn(inp, *args, **kwargs):
         # (3) encountering this error in PyTorch internals.
         xfail("index_reduce", "prod"),
         decorate(
-            "linalg.householder_product", decorator=runOnRocm
+            "linalg.householder_product", decorator=skipIfRocm
         ),  # works on ROCm
         xfail(
             # nans

test/inductor/test_cuda_repro.py

Lines changed: 2 additions & 1 deletion
@@ -36,6 +36,7 @@
     freeze_rng_state,
     IS_FBCODE,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
 )

@@ -62,7 +63,7 @@
         sys.exit(0)
     raise

-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda

test/inductor/test_inductor_freezing.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
     check_model_gpu,
     copy_tests,
 )
-from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
+from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm


 importlib.import_module("functorch")

test/inductor/test_kernel_benchmark.py

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,7 @@
 from torch.testing._internal.common_cuda import xfailIfSM89
 from torch.testing._internal.common_device_type import expectedFailureXPU
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
-
+from torch.testing._internal.common_utils import skipIfRocm

 class TestKernelBenchmark(TestCase):
     device_type = GPU_TYPE

@@ -153,6 +153,7 @@ def f(a, b):
     @expectedFailureXPU
     @config.patch(max_autotune=True, max_autotune_gemm_backends="TRITON")
     @fresh_inductor_cache()
+    @skipIfRocm  # This seems to be disabled upstream https://github.com/pytorch/pytorch/issues/118346
     def test_mm_triton_kernel_benchmark(self):
         M = 2048
         N = 2432

test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions
@@ -72,6 +72,7 @@
 from torch.testing._internal.common_device_type import (
     _has_sufficient_memory,
     expectedFailureXPU,
+    get_desired_device_type_test_bases,
 )
 from torch.testing._internal.common_dtype import all_types, get_all_dtypes
 from torch.testing._internal.common_quantization import (

@@ -91,6 +92,8 @@
     skipIfWindows,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
+    subtest,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     xfailIfS390X,

@@ -126,6 +129,10 @@


 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm

 aten = torch.ops.aten


@@ -7178,6 +7185,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
             ),
         )

+    @skipIfWindows
+    @skipIfRocm
     def test_roi_align(self):
         if not has_torchvision_roi_align():
             raise unittest.SkipTest("requires torchvision")

@@ -8030,6 +8039,7 @@ def fn(a, dim, index, b, reduce):
         )

     @skip_if_gpu_halide
+    # issue #1150
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,7 @@
     TEST_CUDA_MEM_LEAK_CHECK,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
+    skipIfRocm,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU


@@ -242,6 +243,7 @@ def fn(x, y):
         self.assertEqual(r, opt_r)

     @torch._dynamo.config.patch(capture_scalar_outputs=True)
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_unwrap_storage_didnt_work_repro(self, device):
         def f():
             full = torch.full((), 11)

test/nn/test_convolution.py

Lines changed: 5 additions & 2 deletions
@@ -57,6 +57,7 @@
     parametrize as parametrize_test,
     run_tests,
     set_default_dtype,
+    skipIfRocm,
     skipIfNotMiopenSuggestNHWC,
     skipIfRocmVersionLessThan,
     subtest,

@@ -4060,8 +4061,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
         self.assertEqual(grad_weight.shape, weight.shape)

     @onlyCUDA
-    @largeTensorTest("40GB")
-    @largeTensorTest("24GB", "cpu")
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    # Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
+    @skipIfRocm
     def test_conv3d_64bit_indexing(self, device):
         x = torch.rand(1, 32, 512, 512, 256)
         m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)

test/run_test.py

Lines changed: 3 additions & 0 deletions
@@ -185,6 +185,9 @@ def __contains__(self, item):
     "distributed/_tensor/test_attention",
 ]

+if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor <= 9):
+    ROCM_BLOCKLIST.append("test_typing")
+
 # whitelist of tests for s390x
 S390X_TESTLIST = [
     "backends/xeon/test_launch.py",

test/test_cuda.py

Lines changed: 3 additions & 3 deletions
@@ -1959,9 +1959,8 @@ def test_graph_capture_oom(self):
         with torch.cuda.graph(torch.cuda.CUDAGraph()):
             torch.zeros(2**40, device="cuda")

-    @unittest.skipIf(
-        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
-    )
+    @unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     @serialTest()
     @setBlasBackendsToDefaultFinally
     def test_repeat_graph_capture_cublas_workspace_memory(self):

@@ -2918,6 +2917,7 @@ def forward(self, input_dict: dict):
     @unittest.skipIf(
         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
     )
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_graph_make_graphed_callables_same_pool(self):
         torch.manual_seed(5)
         torch.cuda.manual_seed(5)

test/test_fx.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@
     IS_WINDOWS,
     find_library_location,
     run_tests,
+    skipIfRocm,
     skipIfTorchDynamo,
 )
 from torch.testing._internal.jit_utils import JitTestCase

torch/testing/_internal/common_utils.py

Lines changed: 21 additions & 0 deletions
@@ -1360,6 +1360,14 @@ def printErrors(self) -> None:
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')
 IS_S390X = platform.machine() == "s390x"

+def is_navi_arch():
+    if torch.cuda.is_available():
+        prop = torch.cuda.get_device_properties(0)
+        gfx_arch = prop.gcnArchName.split(":")[0]
+        if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+            return True
+    return False
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False

@@ -1849,6 +1857,19 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn

+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
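A short usage sketch for the is_navi_arch() helper added above: it provides an imperative check a test body can use where the decorator form does not fit. This assumes the function is importable from torch.testing._internal.common_utils on this branch; the test class and test names are illustrative.

# Hypothetical example using the is_navi_arch() helper added above.
import unittest

import torch
from torch.testing._internal.common_utils import TestCase, run_tests, is_navi_arch


class NaviCheckExample(TestCase):
    def test_branch_on_navi(self):
        if is_navi_arch():  # function form: decide inside the test body
            raise unittest.SkipTest("temporarily skipped on Navi (gfx1100/gfx1101/gfx1102)")
        self.assertEqual(torch.arange(4).sum().item(), 6)


if __name__ == "__main__":
    run_tests()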

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 9 additions & 1 deletion
@@ -4817,7 +4817,11 @@ def _test_ddp_apply_optim_in_backward(
             # set_to_none for regular optimizer to match in backward
             # case.
             optim.zero_grad(set_to_none=True)
-
+
+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward(self):
         for optim_cls, init_before in itertools.product(

@@ -4830,6 +4834,10 @@ def test_ddp_apply_optim_in_backward(self):
                 init_before=init_before,
             )

+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
         for init_before in [True, False]:
