
Commit 90ac508

jataylo authored and pruthvistony committed
Add skipIfRocmArch decorator for Navi skips (#1356)
1 parent 34d4129 commit 90ac508

File tree: 4 files changed, +74 -2 lines changed

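In short, the commit adds a skipIfRocmArch decorator to torch/testing/_internal/common_utils.py and applies it to inductor tests that currently fail on Navi (gfx1100/gfx1101) GPUs under ROCm. A minimal sketch of the intended usage, assuming a ROCm build of PyTorch; the test class and test body below are hypothetical, not part of the commit:

from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocmArch

# Navi-exclusive skips on ROCm, mirroring the NAVI_ARCH tuple added in the test files.
NAVI_ARCH = ("gfx1100", "gfx1101")

class ExampleInductorTests(TestCase):
    @skipIfRocmArch(NAVI_ARCH)
    def test_known_navi_failure(self):
        # Skipped only when TEST_WITH_ROCM is set and the GPU reports gfx1100/gfx1101;
        # runs normally on CUDA builds and on other ROCm architectures.
        self.assertTrue(True)

if __name__ == "__main__":
    run_tests()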

test/inductor/test_cuda_repro.py

Lines changed: 5 additions & 1 deletion
@@ -23,6 +23,7 @@
 freeze_rng_state,
 IS_FBCODE,
 skipIfRocm,
+skipIfRocmArch,
 TEST_WITH_ASAN,
 )

@@ -42,7 +43,7 @@
 sys.exit(0)
 raise

-
+NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda

@@ -307,6 +308,7 @@ def foo(x):
 out_ref.add_(2)
 # self.assertEqual(out_ref, out)

+@skipIfRocmArch(NAVI_ARCH)
 def test_accuracy_issue1(self):
 class Repro(torch.nn.Module):
 def __init__(self):

@@ -343,6 +345,7 @@ def forward(self, start_positions: torch.Tensor, x: torch.Tensor):
 assert same_two_models(mod, opt_mod, args), "Dynamo failed"

 @config.patch(allow_buffer_reuse=False)
+@skipIfRocmArch(NAVI_ARCH)
 def test_issue103461(self):
 def forward(add_1):
 var_mean = torch.ops.aten.var_mean.correction(

@@ -826,6 +829,7 @@ def forward(self, x):
 res2 = jit_func(x)
 self.assertEqual(res1, res2)

+@skipIfRocmArch(NAVI_ARCH)
 def test_issue103481(self):
 def fn(x, y):
 # NOTE: 6 dimensions is important! does not fail for 5 dimensions

test/inductor/test_torchinductor.py

Lines changed: 22 additions & 1 deletion
@@ -72,7 +72,7 @@
 parametrize,
 serialTest,
 skipIfRocm,
-subtest,
+skipIfRocmArch,
 TEST_WITH_ASAN,
 TEST_WITH_ROCM,
 )

@@ -118,6 +118,7 @@
 _desired_test_bases = get_desired_device_type_test_bases()
 RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
 RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm

 aten = torch.ops.aten
 requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu")

@@ -1246,6 +1247,7 @@ def fn(x):
 # make sure things also work if they aren't unrolled
 self.common(fn, (torch.randn(8, 3),))

+@skipIfRocmArch(NAVI_ARCH)
 def test_multilayer_sum_low_prec(self):
 # fp16 nyi for cpu
 if self.device == "cpu":

@@ -1256,6 +1258,7 @@ def fn(a):

 self.common(fn, ((torch.rand((10, 3, 352, 352), dtype=torch.float16),)))

+@skipIfRocmArch(NAVI_ARCH)
 def test_multilayer_prime_size(self):
 def fn(a):
 return torch.max(a), torch.sum(a)

@@ -1265,6 +1268,7 @@ def fn(a):
 sample[-1] = 1
 self.common(fn, (sample,))

+@skipIfRocmArch(NAVI_ARCH)
 def test_multilayer_var(self):
 def fn(a):
 return torch.var(a)

@@ -2131,6 +2135,7 @@ def fn(a, b):

 self.common(fn, (torch.randn(8, 8), torch.randn(8, 8)))

+@skipIfRocmArch(NAVI_ARCH)
 def test_large_tensor_reduction(self):
 if not _has_sufficient_memory(self.device, 4.5 * 1024**3): # 4.5 GiB
 raise unittest.SkipTest("insufficient memory")

@@ -2151,6 +2156,7 @@ def fn(a):
 expect = torch.tensor(2, dtype=torch.int8, device=self.device)
 self.assertEqual(actual, expect)

+@skipIfRocmArch(NAVI_ARCH)
 def test_large_broadcast_reduction(self):
 if self.device == "cpu":
 raise unittest.SkipTest("Fails on CPU")

@@ -3204,6 +3210,7 @@ def test_conv2d_channels_last(self):
 check_lowp=False,
 )

+@skipIfRocmArch(NAVI_ARCH)
 def test_conv2d_backward_channels_last(self):
 def fn(grad_output, inp, weight):
 convolution_backward_8 = torch.ops.aten.convolution_backward.default(

@@ -3949,6 +3956,7 @@ def fn(x, y):
 self.assertEqual(a.stride(), c.stride())
 self.assertEqual(c.stride()[2], 1)

+@skipIfRocmArch(NAVI_ARCH)
 def test_std(self):
 def fn(x):
 return (

@@ -3991,6 +3999,7 @@ def test_batch_norm_2d(self):

 # From yolov3
 @with_tf32_off
+@skipIfRocmArch(NAVI_ARCH)
 def test_batch_norm_2d_2(self):
 if self.device == "cpu":
 raise unittest.SkipTest(f"requires {GPU_TYPE}")

@@ -4126,6 +4135,7 @@ def fn(x):

 self.common(fn, (x,))

+@skipIfRocmArch(NAVI_ARCH)
 def test_cauchy(self):
 def fn(x, y):
 return torch.sum(1 / (torch.unsqueeze(x, -1) - y))

@@ -5395,6 +5405,7 @@ def fn(a):
 y = fn_compiled(x)
 self.assertTrue(y is not x)

+@skipIfRocmArch(NAVI_ARCH)
 def test_l1_loss(self):
 def fn(a, b):
 return torch.nn.functional.l1_loss(a, b), torch.nn.functional.mse_loss(a, b)

@@ -5791,6 +5802,7 @@ def fn(x):
 fn, (torch.tensor([1, float("inf"), 2, float("-inf"), float("nan")]),)
 )

+@skipIfRocmArch(NAVI_ARCH)
 def test_any(self):
 def fn(x):
 return (

@@ -6478,6 +6490,8 @@ def fn(a, dim, index, b, reduce):
 ],
 )

+# issue #1150
+@skipIfRocmArch(NAVI_ARCH)
 def test_dense_mask_index(self):
 r"""
 There will be a little difference for reduce order between aten and inductor

@@ -7361,6 +7375,7 @@ def fn(a, b):
 b = torch.rand(2, 2, 1, 4, 1).int()
 self.common(fn, (a, b))

+@skipIfRocmArch(NAVI_ARCH)
 def test_argmax_argmin1(self):
 def fn(x):
 return (aten.argmax(x), aten.argmin(x))

@@ -7372,6 +7387,7 @@ def fn(x):
 ],
 )

+@skipIfRocmArch(NAVI_ARCH)
 def test_argmax_argmin2(self):
 def fn(x):
 return (

@@ -7383,6 +7399,7 @@ def fn(x):

 self.common(fn, (torch.randn([144, 144]),))

+@skipIfRocmArch(NAVI_ARCH)
 def test_argmax_argmin_with_duplicates(self):
 def fn(x):
 return (

@@ -7404,6 +7421,7 @@ def fn(x):
 t1 = torch.randint(8, size=(1028, 1028))
 self.common(fn, (t1,))

+@skipIfRocmArch(NAVI_ARCH)
 def test_argmax_argmin_with_nan(self):
 def fn(x):
 return (

@@ -7536,6 +7554,7 @@ def fn(x):
 ],
 )

+@skipIfRocmArch(NAVI_ARCH)
 def test_tmp_not_defined_issue1(self):
 def forward(
 primals_3,

@@ -7930,6 +7949,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 else:
 self.assertEqual(len(inps), 0)

+@skipIfRocmArch(NAVI_ARCH)
 def test_dtype_mismatch_issue(self):
 def fn(x):
 attn = torch.nn.functional.pad(x, [0, 1])

@@ -10414,6 +10434,7 @@ def test_rnn_compile_safe(self):

 class NanCheckerTest(TestCase):
 @config.patch("nan_asserts", True)
+@skipIfRocmArch(NAVI_ARCH)
 def test_nan_checker_pass(self):
 def f(x):
 return torch.softmax(x, dim=-1)

test/inductor/test_torchinductor_opinfo.py

Lines changed: 14 additions & 0 deletions
@@ -31,6 +31,7 @@
 from torch.testing._internal.common_methods_invocations import op_db, skipOps
 from torch.testing._internal.common_utils import (
 dtype_abbrs,
+IS_NAVI,
 IS_MACOS,
 IS_X86,
 skipCUDAMemoryLeakCheckIf,

@@ -203,6 +204,19 @@ def format_op(op):
 # Tensors are not alike
 inductor_skips["cuda"]["logcumsumexp"] = {f32}
 inductor_skips["cuda"]["special.modified_bessel_i1"] = {f64}
+if IS_NAVI:
+    inductor_skips["cuda"]["aminmax"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["dist"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["kron"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("max", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("min", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["nn.functional.conv_transpose3d"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std_mean"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var_mean"] = {b8, f16, f32, f64, i32, i64}

 inductor_expected_failures_single_sample = defaultdict(dict)
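In the opinfo suite the Navi skips are data rather than decorators: entries in inductor_skips["cuda"] keyed either by an op name string or by an (op, variant) tuple such as ("max", "reduction_no_dim"), each mapped to the set of dtypes to skip. A small sketch of how such a table can be consulted; the should_skip helper below is hypothetical and only illustrates the key structure, it is not the mechanism used by the test file:

import torch
from collections import defaultdict

b8, f16, f32 = torch.bool, torch.float16, torch.float32

inductor_skips = defaultdict(dict)
inductor_skips["cuda"]["dist"] = {b8, f16, f32}                       # all variants of dist
inductor_skips["cuda"][("max", "reduction_no_dim")] = {b8, f16, f32}  # one specific variant

def should_skip(device, op_name, variant_name, dtype):
    # Prefer a variant-specific entry, then fall back to the op-wide entry.
    skips = inductor_skips[device]
    dtypes = skips.get((op_name, variant_name)) or skips.get(op_name) or set()
    return dtype in dtypes

print(should_skip("cuda", "max", "reduction_no_dim", torch.float16))    # True
print(should_skip("cuda", "max", "reduction_with_dim", torch.float16))  # False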
torch/testing/_internal/common_utils.py

Lines changed: 33 additions & 0 deletions
@@ -1177,6 +1177,13 @@ def printErrors(self) -> None:
 IS_X86 = platform.machine() in ('x86_64', 'i386')
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')

+IS_NAVI=False
+if torch.cuda.is_available():
+    prop = torch.cuda.get_device_properties(0)
+    gfx_arch = prop.gcnArchName.split(":")[0]
+    if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+        IS_NAVI = True
+
 def is_avx512_vnni_supported():
 if sys.platform != 'linux':
 return False

@@ -1560,6 +1567,19 @@ def wrapper(*args, **kwargs):
 return dec_fn(func)
 return dec_fn

+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
 @wraps(fn)
 def wrapper(*args, **kwargs):

@@ -1569,6 +1589,19 @@ def wrapper(*args, **kwargs):
 raise unittest.SkipTest("test currently only works on the ROCm stack")
 return wrapper

+def runOnRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] not in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def skipIfXpu(func=None, *, msg="test doesn't currently work on the XPU stack"):
 def dec_fn(fn):
 reason = f"skipIfXpu: {msg}"
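Both new helpers in common_utils.py key off torch.cuda.get_device_properties(0).gcnArchName, which on ROCm builds reports the GPU architecture, possibly with feature flags after a colon (for example "gfx90a:sramecc+:xnack-"), hence the split(":")[0]. A small sketch for inspecting that value by hand, assuming a ROCm build of PyTorch with at least one visible GPU:

import torch

if torch.version.hip is not None and torch.cuda.is_available():
    prop = torch.cuda.get_device_properties(0)
    # Compare only the part before the first colon, as the decorators do.
    gfx_arch = prop.gcnArchName.split(":")[0]
    print(f"Detected ROCm arch: {gfx_arch}")
    print("Matches NAVI_ARCH skips:", gfx_arch in ("gfx1100", "gfx1101"))
else:
    print("Not a ROCm GPU build; skipIfRocmArch never triggers a skip here.")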
