
Commit 39eeaac

bdhirsh authored and pytorchmergebot committed
inductor: avoiding moving constructor to cuda when it would cause h2d sync in index_put_ fallback (pytorch#130338)
My attempt at a fix for pytorch#130335; see the issue for more details / internal xref. Any feedback from inductor folks is appreciated. I attempted to make the move-constructors-to-cuda pass a bit less aggressive by detecting when the movement would incur an H2D sync for `aten.index_put_`. I'm not sure if there are any other ops that inductor falls back to eager on that may or may not incur an H2D sync if we change any of their inputs from cpu to cuda.

Pull Request resolved: pytorch#130338
Approved by: https://github.com/eellison
1 parent 93a03ed commit 39eeaac
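
For context (not part of the commit), a minimal eager-mode sketch of the sync behavior described above, assuming a CUDA build of PyTorch; the shapes and variable names are illustrative only:

import math
import torch

x = torch.randn(8, 8, device="cuda")
mask = torch.zeros(8, dtype=torch.bool, device="cuda")
mask[::2] = True

# Value is a cpu scalar: per the explanation in this commit, index_put_ can
# short-circuit to masked_fill_, so the boolean mask is never passed through
# nonzero() and no sync is incurred.
x[:, mask] = -math.inf

# Value is a cuda tensor: the short-circuit no longer applies, and index_put_
# materializes the mask's indices via nonzero(), which synchronizes with the host.
x[:, mask] = torch.tensor(-math.inf, device="cuda")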

File tree

2 files changed, +52 -0 lines changed


test/inductor/test_torchinductor.py

Lines changed: 21 additions & 0 deletions
@@ -11436,6 +11436,27 @@ def fn():
 
         self.assertEqual(fn_opt(), fn())
 
+    # https://github.com/pytorch/pytorch/issues/130335
+    def test_ctr_not_moved_to_cuda_when_used_in_index_put(self):
+        @torch.compile
+        def f(x, mask):
+            x[:, mask] = -math.inf
+            return x
+
+        x_tmp = torch.randn(512, 19, device="cuda")
+        x = x_tmp.permute(1, 0).view(-1, 128, 4)[:, :, 1:]
+
+        mask_tmp = torch.ones(128, 3, dtype=torch.int32, device="cuda")
+        mask = mask_tmp == mask_tmp
+        f(x, mask)
+        code = run_and_get_triton_code(f, x, mask)
+        # What we are testing here:
+        # inductor has a pass to move tensor constructors on cpu to cuda
+        # (the -math.inf will become a scalar-tensor input to index_put_())
+        # we are asserting that when inductor allocates this tensor,
+        # it does not move the tensor constructor to cuda and keeps it on CPU.
+        self.assertFalse("empty_strided_cuda(()" in code)
+
     @config.patch("triton.use_block_ptr", False)
     def test_evict_last_non_coalesced_loads(self):
         @torch.compile

torch/_inductor/fx_passes/post_grad.py

Lines changed: 31 additions & 0 deletions
@@ -1015,6 +1015,35 @@ def fused_int_mm_mul(match: Match, mat1, mat2, mat3, out_dtype=None):
     return inductor.kernel.mm.tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype)
 
 
+def is_index_put_and_requires_h2d_sync_for_cuda_value(node):
+    from torch.fx.operator_schemas import normalize_function
+
+    if node.target not in [
+        torch.ops.aten.index_put.default,
+        torch.ops.aten.index_put_.default,
+    ]:
+        return False
+    # Inductor falls back to aten.index_put_.
+    # index_put_ will call nonzero() and perform an H2D sync if
+    # any of its indices are bool/byte tensors.
+    # However, it will short-circuit this H2D sync and run masked_fill_
+    # if the value we are putting is a cpu scalar.
+    # Therefore, when inductor sees an index_put_ with byte tensor indices,
+    # it should *not* convert the cpu scalar value into a cuda tensor.
+    args_, kwargs_ = normalize_function(node.target, node.args, node.kwargs)
+    any_byte_bool_indices = False
+    indices = args_[1]
+    for i in indices:
+        if i is not None and i.meta["val"].dtype in [torch.bool, torch.int8]:
+            any_byte_bool_indices = True
+
+    val = args_[2].meta["val"]
+    val_is_cpu_scalar = val.device.type == "cpu" and val.numel() == 1
+    # If both these conditions hold, then converting the val
+    # to a cuda tensor will incur an H2D sync when inductor calls aten.index_put_
+    return any_byte_bool_indices and val_is_cpu_scalar
+
+
 class ConstructorMoverPass:
     def __init__(self, target: str, allow_outputs: bool = False) -> None:
         """
@@ -1068,6 +1097,8 @@ def cannot_be_moved(self, node: fx.Node) -> bool:
             and node.target.namespace in ("prims", "aten")
         ):
             return True
+        if is_index_put_and_requires_h2d_sync_for_cuda_value(node):
+            return True
 
         return False
 
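
For reference (not part of the diff), a minimal sketch of how normalize_function lines arguments up with aten::index_put_'s schema, which is why the new helper can read the indices from args_[1] and the value from args_[2]; plain tensors are used here instead of the fx.Node inputs the pass actually sees:

import torch
from torch.fx.operator_schemas import normalize_function

x = torch.zeros(4)
mask = torch.tensor([True, False, True, False])
value = torch.tensor(-1.0)  # 0-dim cpu tensor, i.e. the "cpu scalar" case

# normalize_function binds the call against the op's schema, so the indices
# and the value land at fixed positions no matter how the call was spelled.
args_, kwargs_ = normalize_function(
    torch.ops.aten.index_put_.default, (x, [mask], value), {}
)
print(args_[1])  # the list of index tensors (here, a single bool mask)
print(args_[2].device, args_[2].numel())  # cpu 1 -> the value is a cpu scalar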
