
Commit cd02c85

anijain2305 authored and pytorchmergebot committed
[inductor][subgraph][python-wrapper] Lift subgraph code into functions (pytorch#137200)
Earlier, the subgraphs were inlined into the output code. This PR lifts each subgraph into its own function, and the output code just calls that function.

This is the output code for the test `test_cond_reintepret_view_inputs_outputs`:

Before this PR - https://www.internalfb.com/intern/paste/P1632948905/
With this PR - https://www.internalfb.com/intern/paste/P1632946348/

A relevant snippet from the above paste is:

~~~
def false_graph_0(args):
    false_graph_0_arg0_1, false_graph_0_arg1_1, s0 = args
    args.clear()
    s0 = s0
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        false_graph_0_buf0 = empty_strided_cuda(((-1) + s0, 20), (20, 1), torch.float32)
        false_graph_0_buf1 = empty_strided_cuda(((-1) + s0, 20), (20, 1), torch.float32)
        # Unsorted Source Nodes: [cond, z1, z2], Original ATen: [aten.sub, aten.add]
        triton_poi_fused_add_sub_1_xnumel = (-20) + (20*s0)
        stream0 = get_raw_stream(0)
        triton_poi_fused_add_sub_1.run(false_graph_0_arg0_1, false_graph_0_arg1_1, false_graph_0_buf0, false_graph_0_buf1, triton_poi_fused_add_sub_1_xnumel, grid=grid(triton_poi_fused_add_sub_1_xnumel), stream=stream0)
        del false_graph_0_arg0_1
        del false_graph_0_arg1_1
    return (reinterpret_tensor(false_graph_0_buf0, ((-3) + s0, 20), (20, 1), 40), reinterpret_tensor(false_graph_0_buf1, ((-1) + s0, 16), (20, 1), 4), )


async_compile.wait(globals())
del async_compile


def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1 = args
    args.clear()
    s0 = arg0_1
    assert_size_stride(arg1_1, (s0, 20), (20, 1))
    assert_size_stride(arg2_1, (s0, 20), (20, 1))
    assert_size_stride(arg3_1, (), ())
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        buf0 = [None] * 2
        buf0 = [None] * 2
        if arg3_1.item():
            # subgraph: true_graph_0
            true_graph_0_arg0_1 = reinterpret_tensor(arg1_1, ((-1) + s0, 20), (20, 1), 0)
            true_graph_0_arg1_1 = reinterpret_tensor(arg2_1, ((-1) + s0, 20), (20, 1), 0)
            (true_graph_0_buf0, true_graph_0_buf1) = true_graph_0([true_graph_0_arg0_1, true_graph_0_arg1_1, s0])
            buf0[0] = true_graph_0_buf0
            buf0[1] = true_graph_0_buf1
        else:
            # subgraph: false_graph_0
            false_graph_0_arg0_1 = reinterpret_tensor(arg1_1, ((-1) + s0, 20), (20, 1), 0)
            false_graph_0_arg1_1 = reinterpret_tensor(arg2_1, ((-1) + s0, 20), (20, 1), 0)
            (false_graph_0_buf0, false_graph_0_buf1) = false_graph_0([false_graph_0_arg0_1, false_graph_0_arg1_1, s0])
            buf0[0] = false_graph_0_buf0
            buf0[1] = false_graph_0_buf1
        del arg1_1
        del arg2_1
        del arg3_1
        buf1 = buf0[0]
        buf2 = buf0[1]
        del buf0
    return (buf1, buf2, )
~~~

The key change is to recursively call `codegen` for the subgraph and rely on `SubgraphPythonWrapper` to generate just the subgraph `fn`. The resulting `subgraph_code` is then inserted into the parent wrapper.

Note that this PR only works for the python wrapper. For the cpp wrapper, we need a lot of refactoring to ensure that we don't duplicate the global variables in the output_code. So, for now, I fall back to the old way of inlining for the cpp wrapper. I am hoping someone with more familiarity with the cpp wrapper can support subgraph lifting (cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang @aakhundov).

This work will unblock hierarchical compilation (or the cold-start compile-time work).

Pull Request resolved: pytorch#137200
Approved by: https://github.com/desertfire, https://github.com/eellison
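For orientation, below is a minimal, self-contained toy sketch of the structural change described above. None of these names are inductor APIs; it only illustrates the idea of generating a subgraph's code once as a standalone function and emitting a call to it from the parent wrapper, instead of splicing the subgraph's statements inline.

~~~
# Toy sketch only (made-up helper names, not the inductor implementation):
# lift generated subgraph code into a function and call it from the parent.

def codegen_graph_body(name):
    # stand-in for recursively running wrapper codegen on the subgraph
    return [
        f"{name}_buf0 = compute({name}_arg0)",
        f"return ({name}_buf0,)",
    ]

def lift_subgraph(prologue, name):
    # emit the subgraph once, as a standalone function, ahead of `call`
    prologue.append(f"def {name}(args):")
    prologue.append(f"    {name}_arg0, = args")
    prologue.extend("    " + line for line in codegen_graph_body(name))
    prologue.append("")

def emit_call_site(parent_body, name, outer_input):
    # the parent wrapper now only emits a call to the lifted function
    parent_body.append(f"({name}_buf0,) = {name}([{outer_input}])")

prologue, parent_body = [], []
lift_subgraph(prologue, "false_graph_0")
emit_call_site(parent_body, "false_graph_0", "arg1_1")
print("\n".join(prologue + ["def call(args):"] + ["    " + l for l in parent_body]))
~~~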
1 parent 68272ab commit cd02c85

File tree

6 files changed: +405, -26 lines


test/inductor/test_control_flow.py

Lines changed: 92 additions & 0 deletions
~~~
@@ -335,6 +335,98 @@ def false_fn(x, y):
                 dynamic=True,
             )
 
+    @requires_gpu
+    @parametrize("device", ["cpu", GPU_TYPE])
+    def test_cond_unbacked_symint_outer_to_inner(self, device):
+        class Model(torch.nn.Module):
+            def forward(self, p, a):
+                def true_fn(x):
+                    return torch.cos(x)
+
+                def false_fn(x):
+                    return torch.sin(x)
+
+                nz = torch.nonzero(a)
+                b = torch.ones([nz.size(0), 8], device=nz.device)
+
+                return torch.cond(p, true_fn, false_fn, [b])
+
+        with torch._dynamo.config.patch(
+            {
+                "capture_dynamic_output_shape_ops": True,
+            }
+        ):
+            self._run_test(
+                model=Model(),
+                inputs=(torch.randn(2, 3, 3),),
+                device=device,
+                dynamic=True,
+            )
+
+    @requires_gpu
+    @parametrize("device", ["cpu", GPU_TYPE])
+    def test_cond_unbacked_symint_inner(self, device):
+        class Model(torch.nn.Module):
+            def forward(self, p, a):
+                def true_fn(x):
+                    nz = torch.nonzero(x)
+                    b = torch.ones([nz.size(0), 8], device=nz.device)
+                    return torch.cos(b)
+
+                def false_fn(x):
+                    nz = torch.nonzero(x)
+                    b = torch.ones([nz.size(0), 8], device=nz.device)
+                    return torch.sin(b)
+
+                b = torch.sin(a)
+
+                return torch.cond(p, true_fn, false_fn, [b])
+
+        with torch._dynamo.config.patch(
+            {
+                "capture_dynamic_output_shape_ops": True,
+            }
+        ):
+            self._run_test(
+                model=Model(),
+                inputs=(torch.randn(2, 3, 3),),
+                device=device,
+                dynamic=True,
+            )
+
+    @unittest.skip("unbacked symints from inner to outer graph not supported yet")
+    @requires_gpu
+    @parametrize("device", ["cpu", GPU_TYPE])
+    def test_cond_unbacked_symint_inner_to_outer(self, device):
+        class Model(torch.nn.Module):
+            def forward(self, p, a):
+                def true_fn(x):
+                    nz = torch.nonzero(x)
+                    b = torch.ones([nz.size(0), 8], device=nz.device)
+                    return torch.cos(b)
+
+                def false_fn(x):
+                    nz = torch.nonzero(x)
+                    b = torch.ones([nz.size(0), 8], device=nz.device)
+                    return torch.sin(b)
+
+                b = torch.sin(a)
+
+                y = torch.cond(p, true_fn, false_fn, [b])
+                return torch.sin(y)
+
+        with torch._dynamo.config.patch(
+            {
+                "capture_dynamic_output_shape_ops": True,
+            }
+        ):
+            self._run_test(
+                model=Model(),
+                inputs=(torch.randn(2, 3, 3),),
+                device=device,
+                dynamic=True,
+            )
+
     @requires_gpu
     def test_cond_use_buffers_from_outer_scope(self):
         # subgraphs input shapes include symbolic expressions
~~~
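The three new tests above all exercise `torch.cond` with sizes that come from data-dependent ops (`torch.nonzero`), i.e. unbacked symints either created in the outer graph and passed into the branches, or created inside the branches. A minimal sketch of that pattern outside the `_run_test` harness (assuming a recent PyTorch build where `torch.cond` and the `capture_dynamic_output_shape_ops` dynamo config are available) looks like:

~~~
import torch

def f(p, a):
    def true_fn(x):
        return torch.cos(x)

    def false_fn(x):
        return torch.sin(x)

    nz = torch.nonzero(a)                             # data-dependent size -> unbacked symint
    b = torch.ones([nz.size(0), 8], device=a.device)  # shape depends on the nonzero count
    return torch.cond(p, true_fn, false_fn, [b])

with torch._dynamo.config.patch({"capture_dynamic_output_shape_ops": True}):
    compiled = torch.compile(f, dynamic=True)
    out = compiled(torch.tensor(True), torch.randn(2, 3, 3))
~~~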

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 27 additions & 0 deletions
~~~
@@ -70,6 +70,14 @@ def __init__(self):
         self.initialized_kernels: Dict[str, Kernel] = {}
         self.expr_printer = cexpr
 
+    @staticmethod
+    def create(
+        is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen
+    ):
+        # TODO - support subgraph codegen by lifting functions. Check the
+        # comment at CppWrapperCpu `codegen_subgraph` function.
+        return CppWrapperCpu()
+
     def generate_kernel_call(
         self,
         kernel_name: str,
@@ -1912,6 +1920,25 @@ def codegen_conditional(self, conditional):
             self.writeline(ExitSubgraphLine(self))
         self.writeline("}")
 
+    def codegen_subgraph(self, subgraph, outer_inputs, outer_outputs):
+        # TODO (desertfire) - This function is the old way of supporting
+        # subgraph codegen by inlining subgraphs in the output code. For python
+        # wrapper, we have moved to lifting subgraphs as functions, supported by
+        # PythonWrapperCode `codegen_subgraph` function. We should perhaps
+        # support lifting of subgraphs as functions for cpp wrapper as well.
+        try:
+            self.push_codegened_graph(subgraph.graph)
+            self.writeline(f"{self.comment} subgraph: {subgraph.name}")
+            self.codegen_subgraph_prefix(subgraph, outer_inputs, outer_outputs)
+            parent_graph = V.graph
+            with V.set_graph_handler(subgraph.graph):
+                subgraph.graph.codegen_subgraph(
+                    parent_graph=parent_graph,
+                )
+            self.codegen_subgraph_suffix(subgraph, outer_inputs, outer_outputs)
+        finally:
+            self.pop_codegened_graph()
+
     def codegen_while_loop(self, while_loop):
         name = while_loop.get_name()
         outer_carried_inputs = [
~~~
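The `create` override added here (and the matching ones in `CppWrapperCpuArrayRef` and `CppWrapperGpu` below) ignores the subgraph arguments, which is what keeps cpp wrapper codegen on the old inlining path for now. A hypothetical, self-contained sketch of the factory shape involved is shown below; the class names other than those in the diff are stand-ins, not the real inductor classes.

~~~
# Hypothetical sketch, not inductor code: a wrapper factory that can either
# return a subgraph-specific wrapper (which generates a lifted function) or a
# plain top-level wrapper. The cpp overrides in this PR correspond to the
# second branch only, i.e. they always fall back to inlining subgraphs.
class ToyPythonWrapper:
    def __init__(self, subgraph_name=None, parent_wrapper=None):
        self.subgraph_name = subgraph_name
        self.parent_wrapper = parent_wrapper

    @staticmethod
    def create(is_subgraph, subgraph_name, parent_wrapper):
        if is_subgraph:
            # generate only `def <subgraph_name>(args): ...` for the parent to call
            return ToyPythonWrapper(subgraph_name, parent_wrapper)
        return ToyPythonWrapper()


class ToyCppWrapper(ToyPythonWrapper):
    @staticmethod
    def create(is_subgraph, subgraph_name, parent_wrapper):
        # mirror of the TODOs in the diff: ignore the subgraph arguments for now
        return ToyCppWrapper()
~~~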

torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py

Lines changed: 9 additions & 0 deletions
~~~
@@ -15,6 +15,7 @@
     ExitSubgraphLine,
     MemoryPlanningLine,
     MemoryPlanningState,
+    PythonWrapperCodegen,
 )
 
 
@@ -72,6 +73,14 @@ def __init__(self):
         self.allow_stack_allocation: Optional[bool] = None
         self.stack_allocated_buffers: Dict[BufferName, ir.Buffer] = {}
 
+    @staticmethod
+    def create(
+        is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen
+    ):
+        # TODO - support subgraph codegen by lifting functions. Check the
+        # comment at CppWrapperCpu `codegen_subgraph` function.
+        return CppWrapperCpuArrayRef()
+
     def memory_plan(self):
         from .memory_planning import MemoryPlanner
 
~~~

torch/_inductor/codegen/cpp_wrapper_gpu.py

Lines changed: 9 additions & 1 deletion
~~~
@@ -18,7 +18,7 @@
 from .common import get_device_op_overrides
 from .cpp_utils import cexpr, DTYPE_TO_CPP
 from .cpp_wrapper_cpu import CppWrapperCpu
-from .wrapper import SymbolicCallArg
+from .wrapper import PythonWrapperCodegen, SymbolicCallArg
 
 
 if TYPE_CHECKING:
@@ -171,6 +171,14 @@ def __init__(self) -> None:
         super().__init__()
         self.grid_id = count()
 
+    @staticmethod
+    def create(
+        is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen
+    ):
+        # TODO - support subgraph codegen by lifting functions. Check the
+        # comment at CppWrapperCpu `codegen_subgraph` function.
+        return CppWrapperGpu()
+
     def write_header(self):
         if V.graph.is_const_graph:
             # We do not write header for constant graph, it will be written by main module.
~~~
