 import math

 from dataclasses import dataclass
+from typing import List

 import torch

 import torch._dynamo.test_case
 import torch._dynamo.testing
 import torch._dynamo.utils
+from functorch.compile import aot_module_simplified
 from torch.testing._internal.triton_utils import HAS_CUDA, requires_cuda

 if HAS_CUDA:
@@ -223,6 +225,69 @@ def forward(self, x):
         return self.f(x)


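+# Dummy custom ops exercised by the test below: the forward op ignores its
+# inputs' values and returns ones shaped like ``foo``; the backward op checks
+# that ``bar`` matches ``shape`` and returns ones shaped like ``bar``.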
+@torch.library.custom_op("_torch_testing::custom_op_forward", mutates_args=())
+def custom_op_forward(
+    foo: torch.Tensor,
+    bar: torch.Tensor,
+    shape: List[int],
+) -> torch.Tensor:
+    return torch.ones_like(foo)
+
+
+@custom_op_forward.register_fake
+def _(foo, bar, shape):
+    return torch.empty_like(foo)
+
+
+@torch.library.custom_op("_torch_testing::custom_op_backward", mutates_args=())
+def custom_op_backward(
+    grad_output: torch.Tensor,
+    foo: torch.Tensor,
+    bar: torch.Tensor,
+    shape: List[int],
+) -> torch.Tensor:
+    assert list(bar.shape) == shape
+    return torch.ones_like(bar)
+
+
+@custom_op_backward.register_fake
+def _(grad_output, foo, bar, shape):
+    return torch.empty_like(bar)
+
+
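+# autograd.Function wrapper that routes forward/backward through the custom
+# ops above; backward produces a gradient only for the weight argument.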
+class CustomOpFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, weight, normalized_shape):
+        ctx.normalized_shape = normalized_shape
+        input_ = input.contiguous()
+        weight_ = weight.contiguous()
+        output = custom_op_forward(input_, weight_, ctx.normalized_shape)
+        ctx.save_for_backward(input_, weight_)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input_, weight_ = ctx.saved_tensors
+        # Only the weight gets a gradient; input and shape receive None.
+        grad_weight = custom_op_backward(
+            grad_output.contiguous(),
+            input_,
+            weight_,
+            ctx.normalized_shape,
+        )
+        return None, grad_weight, None
+
+
+class CustomOpModule(torch.nn.Module):
+    def __init__(self, shape):
+        super().__init__()
+        self.shape = shape
+        self.weight = torch.nn.Parameter(torch.ones(self.shape))
+
+    def forward(self, x):
+        return CustomOpFunc.apply(x, self.weight, self.shape)
+
+
 class AutogradFunctionTests(torch._dynamo.test_case.TestCase):
     # Sound behaviors, tested for working capture
     def test_autograd_function_equivalence(self):
@@ -527,18 +592,29 @@ def forward(self, L_x_: "f32[]", L_z_: "f32[]", L_weird_b: "f32[]", L_weird_c: "

 class GraphModule(torch.nn.Module):
     def forward(self, ctx, x: "f32[]", z: "f32[]", l_weird_b: "f32[]", l_weird_c: "f32[]"):
-        mul: "f32[]" = l_weird_b * l_weird_c
-        clone: "f32[]" = x.clone(); x = None
+        ctx_1 = ctx
+        x_1 = x
+        z_1 = z
+        l_weird_b_1 = l_weird_b
+        l_weird_c_1 = l_weird_c
+
+        mul: "f32[]" = l_weird_b_1 * l_weird_c_1
+        clone: "f32[]" = x_1.clone(); x_1 = None
         mul_1: "f32[]" = mul * clone; mul = clone = None
-        return (mul_1, [l_weird_b, l_weird_c])
+        return (mul_1, [l_weird_b_1, l_weird_c_1])

 class GraphModule(torch.nn.Module):
     def forward(self, ctx, grad: "f32[]", l_weird_b: "f32[]", l_weird_c: "f32[]"):
+        ctx_1 = ctx
+        grad_1 = grad
+        l_weird_b_1 = l_weird_b
+        l_weird_c_1 = l_weird_c
+
         _set_grad_enabled = torch._C._set_grad_enabled(False)

-        mul: "f32[]" = grad * l_weird_b; l_weird_b = None
-        mul_1: "f32[]" = mul * l_weird_c; mul = l_weird_c = None
-        mul_2: "f32[]" = grad * 2; grad = None
+        mul: "f32[]" = grad_1 * l_weird_b_1; l_weird_b_1 = None
+        mul_1: "f32[]" = mul * l_weird_c_1; mul = l_weird_c_1 = None
+        mul_2: "f32[]" = grad_1 * 2; grad_1 = None

         _set_grad_enabled_1 = torch._C._set_grad_enabled(True)
         return (mul_1, mul_2)
@@ -1103,6 +1179,22 @@ def fn():
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 2)

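+    # Compile CustomOpModule with an aot_autograd backend whose fw_compiler
+    # returns the traced graph unchanged, then check that the compiled output
+    # matches eager and that backward runs through the custom op.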
+    def test_custom_op(self):
+        shape = [7]
+        x = torch.rand(128, shape[0])
+        model = CustomOpModule(shape)
+        out = model(x)
+
+        def backend(gm, example_inputs):
+            return aot_module_simplified(
+                gm, example_inputs, fw_compiler=lambda gm, _: gm
+            )
+
+        opt_model = torch.compile(model, backend=backend)
+        opt_out = opt_model(x)
+        opt_out.mean().backward()
+        self.assertEqual(out, opt_out)
+
     @requires_cuda
     def test_triton_kernel_basic(self):
         class Add(torch.autograd.Function):