
Commit 69a7738

mlazos authored and pytorchmergebot committed
Propagate buffer and parameter indices through AOT (pytorch#130393)
Pull Request resolved: pytorch#130393
Approved by: https://github.com/bdhirsh
ghstack dependencies: pytorch#130391, pytorch#130392, pytorch#130503
1 parent 200d3d0 commit 69a7738
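
For orientation, a minimal sketch of the user-visible behavior this change targets (not part of the commit; the TwoTensor import path and the bare torch.compile call are assumptions here -- the authoritative version is the new test in test/dynamo/test_subclasses.py below):

# Sketch only. Assumes TwoTensor lives at torch.testing._internal.two_tensor
# and that no extra config patches are needed for illustration.
import torch
from torch._dynamo.decorators import mark_static_address
from torch.testing._internal.two_tensor import TwoTensor  # assumed import path

inner = torch.ones(4)
x = TwoTensor(inner, inner)          # a traceable wrapper subclass: desugars into two plain tensors
mark_static_address(x, guard=False)  # mark the wrapped input as a static address

@torch.compile
def fn(t0, t1, t2):
    return t0 + t1 + t2 + 2

# x sits at wrapped position 1; after desugaring its two inner tensors occupy
# unwrapped positions 1 and 2, so those indices should be reported as static
# input indices to the inductor backend.
fn(torch.ones(4), x, torch.ones(4))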

File tree: 8 files changed (+91, −15 lines)


test/dynamo/test_subclasses.py

Lines changed: 39 additions & 0 deletions
@@ -1526,6 +1526,45 @@ def f(x):
         out_test = compiled_f(view)
         self.assertEqual(out_ref, out_test)
 
+    @torch._dynamo.config.patch("inline_inbuilt_nn_modules", True)
+    def test_mark_static_with_subclass_desugaring(self):
+        from typing import Any, Callable, Dict, List, Optional
+
+        from torch._dynamo.decorators import mark_static_address
+        from torch._inductor.compile_fx import compile_fx
+        from torch._inductor.cudagraph_utils import BoxedDeviceIndex
+        from torch._inductor.utils import BoxedBool
+
+        x_inner = torch.ones(4)
+        x = TwoTensor(x_inner, x_inner)
+        mark_static_address(x, guard=False)
+
+        def inner_compile(
+            gm: torch.fx.GraphModule,
+            example_inputs: List[torch.Tensor],
+            cudagraphs: Optional[BoxedBool] = None,
+            static_input_idxs: Optional[List[int]] = None,
+            is_backward: bool = False,
+            graph_id: Optional[int] = None,
+            cpp_wrapper: bool = False,
+            aot_mode: bool = False,
+            is_inference: bool = False,
+            boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
+            user_visible_outputs: Optional[Dict[str, None]] = None,
+            layout_opt: Optional[bool] = None,
+            extern_node_serializer: Optional[Callable[[List[Any]], Any]] = None,
+        ):
+            self.assertEqual(static_input_idxs, [1, 2])
+            return gm
+
+        compiler = functools.partial(compile_fx, inner_compile=inner_compile)
+
+        @torch.compile(backend=compiler)
+        def fn(t0, t1, t2):
+            return t0 + t1 + t2 + 2
+
+        fn(torch.ones(4), x, torch.ones(4))
+
 
 instantiate_parametrized_tests(SubclassTests)
 

torch/_functorch/_aot_autograd/collect_metadata_analysis.py

Lines changed: 11 additions & 10 deletions
@@ -11,7 +11,7 @@
 import collections
 import logging
 from functools import wraps
-from typing import Callable, DefaultDict, Dict, List
+from typing import Callable, DefaultDict, Dict, List, Optional
 
 import torch
 import torch.utils._pytree as pytree
@@ -25,6 +25,7 @@
     is_traceable_wrapper_subclass,
     transform_subclass,
 )
+
 from .functional_utils import (
     are_all_mutations_hidden_from_autograd,
     are_all_mutations_under_no_grad_or_inference_mode,
@@ -124,6 +125,8 @@ def run_functionalized_fw_and_collect_metadata(
     keep_input_mutations: bool,
     # TODO: refactor to kill this flag
     is_train: bool = False,
+    # Note: this is guaranteed to be set when running under dynamo
+    static_input_indices: Optional[List[int]] = None,
     pre_dispatch: bool = False,
 ) -> Callable[..., ViewAndMutationMeta]:
     memo: Dict[Tensor, Tensor] = {}
@@ -666,17 +669,15 @@ def view_avoid_dupes_with_primals(t):
         )
         user_outs = pytree.tree_map(from_fun, f_output_tangents)
 
-        if (
-            torch._dynamo.config.inline_inbuilt_nn_modules
-            or torch._dynamo.compiled_autograd.in_compiled_autograd_region
-        ):
-            static_parameter_input_indices = [
+        nonlocal static_input_indices
+        static_input_indices = static_input_indices or []
+        if torch._dynamo.compiled_autograd.in_compiled_autograd_region:
+            passed_indices = set(static_input_indices)
+            static_input_indices = [
                 i
                 for i, arg in enumerate(flat_args)
-                if isinstance(arg, torch.nn.Parameter)
+                if (isinstance(arg, torch.nn.Parameter) or i in passed_indices)
             ]
-        else:
-            static_parameter_input_indices = []
 
         f_mutated_inputs = [
             inp
@@ -729,7 +730,7 @@ def view_avoid_dupes_with_primals(t):
             subclass_tangent_meta=create_subclass_meta(traced_tangents),
             is_train=is_train,
             grad_enabled_mutation=grad_enabled_mutation,
-            static_parameter_indices=static_parameter_input_indices,
+            static_input_indices=static_input_indices,
             tokens=mode._tokens,
         )
         return metadata
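
For reference, a standalone sketch of what the rewritten block above computes (local names, plain Python, not the AOT-internal code): the indices passed in from dynamo are kept, and inside the compiled-autograd region every nn.Parameter input is additionally treated as static.

import torch
from typing import List, Optional

def resolve_static_indices(
    flat_args,
    static_input_indices: Optional[List[int]],
    in_compiled_autograd_region: bool,
) -> List[int]:
    # Mirrors the hunk above: keep what was passed in; in the compiled-autograd
    # region also mark every nn.Parameter input as static.
    static_input_indices = static_input_indices or []
    if in_compiled_autograd_region:
        passed_indices = set(static_input_indices)
        static_input_indices = [
            i
            for i, arg in enumerate(flat_args)
            if isinstance(arg, torch.nn.Parameter) or i in passed_indices
        ]
    return static_input_indices

p = torch.nn.Parameter(torch.ones(2))
print(resolve_static_indices([torch.ones(2), p], [0], True))   # [0, 1]
print(resolve_static_indices([torch.ones(2), p], [0], False))  # [0]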

torch/_functorch/_aot_autograd/runtime_wrappers.py

Lines changed: 2 additions & 0 deletions
@@ -905,6 +905,7 @@ def wrapped_flat_fn(*args):
         if config.debug_assert:
             ref_fw_metadata = run_functionalized_fw_and_collect_metadata(
                 wrapped_flat_fn,
+                static_input_indices=aot_config.static_input_indices,
                 keep_input_mutations=fw_metadata.keep_input_mutations,
                 is_train=fw_metadata.is_train,
             )(*deduped_flat_args)
@@ -1094,6 +1095,7 @@ def wrapped_flat_fn(*args):
         if config.debug_assert:
             ref_fw_metadata = run_functionalized_fw_and_collect_metadata(
                 wrapped_flat_fn,
+                static_input_indices=aot_config.static_input_indices,
                 keep_input_mutations=fw_metadata.keep_input_mutations,
                 is_train=fw_metadata.is_train,
             )(*flat_args_with_synthetic_bases)

torch/_functorch/_aot_autograd/schemas.py

Lines changed: 2 additions & 1 deletion
@@ -329,7 +329,7 @@ class ViewAndMutationMeta:
     deterministic: Optional[bool] = None
 
     # Keeps track of which input indices store parameters (which we will treat as static)
-    static_parameter_indices: List[int] = field(default_factory=list)
+    static_input_indices: List[int] = field(default_factory=list)
 
     # Map of effect type (ex. _EffectType.ORDERED) to token. If there are
     # side-effectful operators, FunctionalTensorMode will populate this
@@ -803,6 +803,7 @@ class AOTConfig:
     no_tangents: bool = False
     dynamic_shapes: bool = False
     aot_autograd_arg_pos_to_source: Optional[List[Source]] = None
+    static_input_indices: Optional[List[int]] = None
     inference_compiler: Optional[Callable] = None
     enable_log: bool = True
     # this is always false outside of export.

torch/_functorch/_aot_autograd/subclass_utils.py

Lines changed: 18 additions & 0 deletions
@@ -136,6 +136,24 @@ def concat_inner_tensors_from_subclasses(xs):
     return unwrapped_args
 
 
+def remap_unwrapped_subclass_arg_indices(wrapped_args, static_input_indices):
+    static_input_indices = set(static_input_indices)
+    new_ind = 0
+    remapped_static_indices = []
+    for i, arg in enumerate(wrapped_args):
+        num_indices = 1
+        if is_traceable_wrapper_subclass(arg):
+            num_indices = len(get_plain_tensors(arg))
+
+        for _ in range(num_indices):
+            if i in static_input_indices:
+                remapped_static_indices.append(new_ind)
+
+            new_ind += 1
+
+    return remapped_static_indices
+
+
 # Turns a flattened list of tensor arguments into (maybe) subclass tensors.
 # This function is used both at trace time and runtime, so we have an is_runtime flag telling us which context we're in.
 def wrap_tensor_subclasses(
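
A small worked example of the remapping above (standalone sketch; the real subclass detection via is_traceable_wrapper_subclass/get_plain_tensors is replaced by a hypothetical per-argument count): a wrapped slot that desugars into N plain tensors occupies N consecutive unwrapped slots, and a static wrapped slot makes all of its unwrapped slots static.

def remap(wrapped_sizes, static_input_indices):
    # wrapped_sizes[i] = how many plain tensors wrapped arg i desugars into
    # (1 for a plain tensor, len(get_plain_tensors(arg)) for a subclass).
    static = set(static_input_indices)
    remapped, new_ind = [], 0
    for i, n in enumerate(wrapped_sizes):
        for _ in range(n):
            if i in static:
                remapped.append(new_ind)
            new_ind += 1
    return remapped

# Matches the new test: [plain tensor, TwoTensor (2 inner tensors), plain tensor]
# with the TwoTensor at wrapped index 1 marked static -> unwrapped indices [1, 2].
print(remap([1, 2, 1], [1]))  # [1, 2]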

torch/_functorch/_aot_autograd/traced_function_transforms.py

Lines changed: 5 additions & 0 deletions
@@ -53,6 +53,7 @@
 )
 from .subclass_utils import (
     create_subclass_meta,
+    remap_unwrapped_subclass_arg_indices,
     requires_subclass_dispatch,
     unwrap_tensor_subclasses,
     wrap_tensor_subclasses_maybe_joint,
@@ -702,6 +703,9 @@ def metadata_fn(*primals):
         args_unwrapped = unwrap_tensor_subclasses(
             args, is_joint_structure=is_joint_structure
         )
+        remapped_static_indices = remap_unwrapped_subclass_arg_indices(
+            args, meta.static_input_indices
+        )
 
         if is_joint_structure:
             primals_unwrapped = args_unwrapped[0]
@@ -729,6 +733,7 @@ def metadata_fn(*primals):
     # See Note: [Partitioner handling for Subclasses, Part 2] for more info.
     meta_updated = run_functionalized_fw_and_collect_metadata(
         metadata_fn,
+        static_input_indices=remapped_static_indices,
         keep_input_mutations=meta.keep_input_mutations,
         is_train=meta.is_train,
     )(*primals_unwrapped)

torch/_functorch/aot_autograd.py

Lines changed: 12 additions & 2 deletions
@@ -20,6 +20,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+
 from . import config
 from ._aot_autograd.autograd_cache import (  # noqa: F401
     AOTAutogradCache,
@@ -588,6 +589,7 @@ def _dup_fake_script_obj(fake_flat_args):
     with ctx:
         fw_metadata = run_functionalized_fw_and_collect_metadata(
             flat_fn,
+            static_input_indices=aot_config.static_input_indices,
             keep_input_mutations=aot_config.keep_inference_input_mutations,
             is_train=needs_autograd,
             pre_dispatch=aot_config.pre_dispatch,
@@ -623,6 +625,7 @@ def _dup_fake_script_obj(fake_flat_args):
                     keep_input_mutations=aot_config.keep_inference_input_mutations,
                     is_train=False,
                     pre_dispatch=aot_config.pre_dispatch,
+                    static_input_indices=aot_config.static_input_indices,
                 )(*fake_flat_args)
             else:
                 fw_metadata = ViewAndMutationMeta(
@@ -636,7 +639,7 @@ def _dup_fake_script_obj(fake_flat_args):
                     subclass_tangent_meta=fw_metadata.subclass_tangent_meta,
                     is_train=False,
                     tokens=fw_metadata.tokens,
-                    static_parameter_indices=fw_metadata.static_parameter_indices,
+                    static_input_indices=fw_metadata.static_input_indices,
                 )
 
         if fw_metadata.num_intermediate_bases > 0:
@@ -941,9 +944,10 @@ def aot_module_simplified(
     # Next, the input args
     full_args.extend(args)
 
+    static_input_indices = []
     if hasattr(mod, "graph"):
         # Non dynamo entrypoints can get to here...
-        for node in mod.graph.find_nodes(op="placeholder"):
+        for pos, node in enumerate(mod.graph.find_nodes(op="placeholder")):
             if hasattr(node, "_dynamo_source"):
                 # ... but not here!
                 if aot_autograd_arg_pos_to_source is None:
@@ -953,6 +957,11 @@ def aot_module_simplified(
                 seen_sources.add(source)
                 aot_autograd_arg_pos_to_source.append(source)
 
+            if "tensor_dict" in node.meta and node.meta["tensor_dict"].get(
+                "_dynamo_static_input_type", None
+            ):
+                static_input_indices.append(pos)
+
     if aot_autograd_arg_pos_to_source is not None:
         assert len(full_args) == len(aot_autograd_arg_pos_to_source)
 
@@ -973,6 +982,7 @@ def aot_module_simplified(
         keep_inference_input_mutations=keep_inference_input_mutations,
         dynamic_shapes=dynamic_shapes,
        aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source,
+        static_input_indices=static_input_indices,
        is_export=False,
        no_tangents=False,
        cache_key=None,

torch/_inductor/compile_fx.py

Lines changed: 2 additions & 2 deletions
@@ -136,7 +136,7 @@ def get_static_input_idxs(num_fixed):
     if not context or not context.fw_metadata:
         return fixed
 
-    return fixed + context.fw_metadata.static_parameter_indices
+    return fixed + context.fw_metadata.static_input_indices
 
 
 @functools.lru_cache(None)
@@ -1252,7 +1252,7 @@ def fw_compiler_freezing(
                 params_flat[i] = None
 
     if tracing_context.fw_metadata:
-        static_input_idxs += tracing_context.fw_metadata.static_parameter_indices
+        static_input_idxs += tracing_context.fw_metadata.static_input_indices
 
     with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
        optimized_function = inner_compile(

0 commit comments
