Commit 9726c26

jataylo authored and dnikolaev-amd committed
Add skipIfRocmArch decorator for Navi skips (#1356)
1 parent 3120778 commit 9726c26

File tree

4 files changed: +77 −1 lines changed

test/inductor/test_cuda_repro.py
test/inductor/test_torchinductor.py
test/inductor/test_torchinductor_opinfo.py
torch/testing/_internal/common_utils.py

test/inductor/test_cuda_repro.py

Lines changed: 5 additions & 1 deletion
@@ -27,6 +27,7 @@
     freeze_rng_state,
     IS_FBCODE,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
 )

@@ -48,7 +49,7 @@
         sys.exit(0)
     raise

-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda
@@ -313,6 +314,7 @@ def foo(x):
         out_ref.add_(2)
         # self.assertEqual(out_ref, out)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_accuracy_issue1(self):
         class Repro(torch.nn.Module):
             def __init__(self):
@@ -349,6 +351,7 @@ def forward(self, start_positions: torch.Tensor, x: torch.Tensor):
         assert same_two_models(mod, opt_mod, args), "Dynamo failed"

     @config.patch(allow_buffer_reuse=False)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_issue103461(self):
         def forward(add_1):
             var_mean = torch.ops.aten.var_mean.correction(
@@ -832,6 +835,7 @@ def forward(self, x):
         res2 = jit_func(x)
         self.assertEqual(res1, res2)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_issue103481(self):
         def fn(x, y):
             # NOTE: 6 dimensions is important! does not fail for 5 dimensions
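For context, NAVI_ARCH is matched against the prefix of the device's gcnArchName. A standalone sketch (not part of this commit, and assuming a ROCm build of PyTorch where gcnArchName is populated) to check what a local GPU reports:

import torch

# Standalone sketch, not part of this commit: print the arch string that the
# new skipIfRocmArch decorator compares against. On ROCm, gcnArchName looks
# like "gfx1100:sramecc+:xnack-"; only the part before the first ":" is used.
if torch.cuda.is_available():
    prop = torch.cuda.get_device_properties(0)
    gfx_arch = prop.gcnArchName.split(":")[0]
    print(f"arch = {gfx_arch}, matches NAVI_ARCH: {gfx_arch in ('gfx1100', 'gfx1101')}")
else:
    print("no GPU visible")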

test/inductor/test_torchinductor.py

Lines changed: 25 additions & 0 deletions
@@ -87,6 +87,7 @@
     skipIfRocm,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
 )
@@ -129,6 +130,10 @@
 )

 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm

 aten = torch.ops.aten

@@ -1648,6 +1653,7 @@ def fn(x):
         # make sure things also work if they aren't unrolled
         self.common(fn, (torch.randn(8, 3),))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_sum_low_prec(self):
         # fp16 nyi for cpu
         if self.device == "cpu":
@@ -1658,6 +1664,7 @@ def fn(a):

         self.common(fn, ((torch.rand((10, 3, 352, 352), dtype=torch.float16),)))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_prime_size(self):
         def fn(a):
             return torch.max(a), torch.sum(a)
@@ -1668,6 +1675,7 @@ def fn(a):
         self.common(fn, (sample,))

     @skipCPUIf(IS_MACOS, "fails on macos")
+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_var(self):
         def fn(a):
             return torch.var(a)
@@ -2667,6 +2675,7 @@ def fn(a, b):

         self.common(fn, (torch.randn(8, 8), torch.randn(8, 8)))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_large_tensor_reduction(self):
         if not _has_sufficient_memory(self.device, 4.5 * 1024**3):  # 4.5 GiB
             raise unittest.SkipTest("insufficient memory")
@@ -2687,6 +2696,7 @@ def fn(a):
         expect = torch.tensor(2, dtype=torch.int8, device=self.device)
         self.assertEqual(actual, expect)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_large_broadcast_reduction(self):
         if self.device == "cpu":
             raise unittest.SkipTest("Fails on CPU")
@@ -3761,6 +3771,7 @@ def test_conv2d_channels_last(self):
             check_lowp=False,
         )

+    @skipIfRocmArch(NAVI_ARCH)
     def test_conv2d_backward_channels_last(self):
         def fn(grad_output, inp, weight):
             convolution_backward_8 = torch.ops.aten.convolution_backward.default(
@@ -4520,6 +4531,7 @@ def fn(x, y):
         self.assertEqual(a.stride(), c.stride())
         self.assertEqual(c.stride()[2], 1)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_std(self):
         def fn(x):
             return (
@@ -4562,6 +4574,7 @@ def test_batch_norm_2d(self):

     # From yolov3
     @with_tf32_off
+    @skipIfRocmArch(NAVI_ARCH)
     def test_batch_norm_2d_2(self):
         if self.device == "cpu":
             raise unittest.SkipTest(f"requires {GPU_TYPE}")
@@ -4697,6 +4710,7 @@ def fn(x):

         self.common(fn, (x,))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_cauchy(self):
         def fn(x, y):
             return torch.sum(1 / (torch.unsqueeze(x, -1) - y))
@@ -6034,6 +6048,7 @@ def fn(a):
         y = fn_compiled(x)
         self.assertTrue(y is not x)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_l1_loss(self):
         def fn(a, b):
             return torch.nn.functional.l1_loss(a, b), torch.nn.functional.mse_loss(a, b)
@@ -6430,6 +6445,7 @@ def fn(x):
             fn, (torch.tensor([1, float("inf"), 2, float("-inf"), float("nan")]),)
         )

+    @skipIfRocmArch(NAVI_ARCH)
     def test_any(self):
         def fn(x):
             return (
@@ -7177,6 +7193,8 @@ def fn(a, dim, index, b, reduce):
             check_lowp=check_lowp,
         )

+    # issue #1150
+    @skipIfRocmArch(NAVI_ARCH)
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor
@@ -8152,6 +8170,7 @@ def fn(a, b):
         b = torch.rand(2, 2, 1, 4, 1).int()
         self.common(fn, (a, b))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin1(self):
         def fn(x):
             return (aten.argmax(x), aten.argmin(x))
@@ -8163,6 +8182,7 @@ def fn(x):
             ],
         )

+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin2(self):
         def fn(x):
             return (
@@ -8174,6 +8194,7 @@ def fn(x):

         self.common(fn, (torch.randn([144, 144]),))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin_with_duplicates(self):
         def fn(x):
             return (
@@ -8195,6 +8216,7 @@ def fn(x):
         t1 = torch.randint(8, size=(1028, 1028))
         self.common(fn, (t1,))

+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin_with_nan(self):
         def fn(x):
             return (
@@ -8327,6 +8349,7 @@ def fn(x):
             ],
         )

+    @skipIfRocmArch(NAVI_ARCH)
     def test_tmp_not_defined_issue1(self):
         def forward(
             primals_3,
@@ -8721,6 +8744,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         else:
             self.assertEqual(len(inps), 0)

+    @skipIfRocmArch(NAVI_ARCH)
     def test_dtype_mismatch_issue(self):
         def fn(x):
             attn = torch.nn.functional.pad(x, [0, 1])
@@ -11388,6 +11412,7 @@ def test_rnn_compile_safe(self):

 class NanCheckerTest(TestCase):
     @config.patch("nan_asserts", True)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_nan_checker_pass(self):
         def f(x):
             return torch.softmax(x, dim=-1)

test/inductor/test_torchinductor_opinfo.py

Lines changed: 14 additions & 0 deletions
@@ -31,6 +31,7 @@
 from torch.testing._internal.common_methods_invocations import op_db, skipOps
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
+    IS_NAVI,
     IS_MACOS,
     IS_X86,
     skipCUDAMemoryLeakCheckIf,
@@ -203,6 +204,19 @@ def format_op(op):
 # Tensors are not alike
 inductor_skips["cuda"]["logcumsumexp"] = {f32}
 inductor_skips["cuda"]["special.modified_bessel_i1"] = {f64}
+if IS_NAVI:
+    inductor_skips["cuda"]["aminmax"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["dist"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["kron"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("max", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("min", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["nn.functional.conv_transpose3d"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std_mean"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var_mean"] = {b8, f16, f32, f64, i32, i64}

 inductor_expected_failures_single_sample = defaultdict(dict)

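The skip table above is keyed by device, then by op name or an (op, variant) tuple, and maps to the set of dtype markers to skip. A small illustrative sketch of that lookup shape (the helper name and values are hypothetical, not from the test harness):

from collections import defaultdict

# Hypothetical sketch mirroring the shape of inductor_skips above:
# device -> op name or (op, variant) tuple -> set of dtype markers.
f16, f32, f64 = "f16", "f32", "f64"  # stand-ins for the real dtype objects

skips = defaultdict(dict)
skips["cuda"]["std"] = {f16, f32, f64}
skips["cuda"][("max", "reduction_no_dim")] = {f16, f32, f64}

def is_skipped(device, op, variant, dtype):
    # Variant-specific entries use the (op, variant) key; plain entries use op.
    table = skips[device]
    dtypes = table.get((op, variant), table.get(op, set()))
    return dtype in dtypes

print(is_skipped("cuda", "std", "", f32))                  # True
print(is_skipped("cuda", "max", "reduction_no_dim", f16))  # True
print(is_skipped("cuda", "min", "reduction_no_dim", f16))  # False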

torch/testing/_internal/common_utils.py

Lines changed: 33 additions & 0 deletions
@@ -1180,6 +1180,13 @@ def printErrors(self) -> None:
 IS_X86 = platform.machine() in ('x86_64', 'i386')
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')

+IS_NAVI=False
+if torch.cuda.is_available():
+    prop = torch.cuda.get_device_properties(0)
+    gfx_arch = prop.gcnArchName.split(":")[0]
+    if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+        IS_NAVI = True
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False
@@ -1590,6 +1597,19 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn

+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -1599,6 +1619,19 @@ def wrapper(*args, **kwargs):
             raise unittest.SkipTest("test currently only works on the ROCm stack")
     return wrapper

+def runOnRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] not in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def skipIfXpu(func=None, *, msg="test doesn't currently work on the XPU stack"):
     def dec_fn(fn):
         reason = f"skipIfXpu: {msg}"
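A minimal usage sketch of the two new helpers (the test class and test names below are hypothetical; both decorators only take effect when TEST_WITH_ROCM is set):

import torch
from torch.testing._internal.common_utils import (
    run_tests,
    runOnRocmArch,
    skipIfRocmArch,
    TestCase,
)

NAVI_ARCH = ("gfx1100", "gfx1101")  # same tuple the inductor tests define

class MyRocmArchTests(TestCase):  # hypothetical test class, not from this commit
    @skipIfRocmArch(NAVI_ARCH)
    def test_skipped_on_navi(self):
        # Skipped only when running under ROCm on a gfx1100/gfx1101 GPU.
        self.assertEqual(torch.ones(2).sum().item(), 2.0)

    @runOnRocmArch(NAVI_ARCH)
    def test_skipped_on_non_navi_rocm(self):
        # Under ROCm, skipped unless the GPU is gfx1100/gfx1101;
        # on non-ROCm builds the decorator is a no-op.
        self.assertEqual(torch.zeros(2).sum().item(), 0.0)

if __name__ == "__main__":
    run_tests()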
