
Commit 2759a7e

Propagate buffer and parameter indices through AOT
ghstack-source-id: 90b6a05
Pull Request resolved: #130393

1 parent 2021eb8 · commit 2759a7e

6 files changed: +23 −19 lines changed


torch/_functorch/_aot_autograd/collect_metadata_analysis.py

Lines changed: 4 additions & 14 deletions
@@ -11,7 +11,7 @@
 import collections
 import logging
 from functools import wraps
-from typing import Callable, DefaultDict, Dict, List
+from typing import Callable, DefaultDict, Dict, List, Optional
 
 import torch
 import torch.utils._pytree as pytree
@@ -25,6 +25,7 @@
     is_traceable_wrapper_subclass,
     transform_subclass,
 )
+
 from .functional_utils import (
     are_all_mutations_hidden_from_autograd,
     are_all_mutations_under_no_grad_or_inference_mode,
@@ -124,6 +125,7 @@ def run_functionalized_fw_and_collect_metadata(
     keep_input_mutations: bool,
     # TODO: refactor to kill this flag
     is_train: bool = False,
+    static_input_indices: Optional[List[int]] = None,
     pre_dispatch: bool = False,
 ) -> Callable[..., ViewAndMutationMeta]:
     memo: Dict[Tensor, Tensor] = {}
@@ -666,18 +668,6 @@ def view_avoid_dupes_with_primals(t):
         )
         user_outs = pytree.tree_map(from_fun, f_output_tangents)
 
-        if (
-            torch._dynamo.config.inline_inbuilt_nn_modules
-            or torch._dynamo.compiled_autograd.in_compiled_autograd_region
-        ):
-            static_parameter_input_indices = [
-                i
-                for i, arg in enumerate(flat_args)
-                if isinstance(arg, torch.nn.Parameter)
-            ]
-        else:
-            static_parameter_input_indices = []
-
         f_mutated_inputs = [
             inp
             for inp, info in zip(flat_f_args, input_info)
@@ -729,7 +719,7 @@ def view_avoid_dupes_with_primals(t):
             subclass_tangent_meta=create_subclass_meta(traced_tangents),
             is_train=is_train,
             grad_enabled_mutation=grad_enabled_mutation,
-            static_parameter_indices=static_parameter_input_indices,
+            static_input_indices=static_input_indices if static_input_indices else [],
             tokens=mode._tokens,
         )
         return metadata
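
Aside (not part of the commit): the hunks above delete the parameter-detection logic from metadata collection, so the caller is now expected to compute the index list and pass it in as static_input_indices. Below is a minimal, self-contained sketch of what the removed detection did; the flat_args list is a toy stand-in, not code from this file.

import torch

# Previously, static indices were inferred inside metadata collection by
# scanning the flattened arguments for nn.Parameter (only when inlining nn
# modules or inside a compiled-autograd region). Toy example:
flat_args = [torch.nn.Parameter(torch.randn(2)), torch.randn(2)]
static_parameter_input_indices = [
    i for i, arg in enumerate(flat_args) if isinstance(arg, torch.nn.Parameter)
]
print(static_parameter_input_indices)  # [0]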

torch/_functorch/_aot_autograd/runtime_wrappers.py

Lines changed: 2 additions & 0 deletions
@@ -905,6 +905,7 @@ def wrapped_flat_fn(*args):
         if config.debug_assert:
             ref_fw_metadata = run_functionalized_fw_and_collect_metadata(
                 wrapped_flat_fn,
+                static_input_indices=aot_config.static_input_indices,
                 keep_input_mutations=fw_metadata.keep_input_mutations,
                 is_train=fw_metadata.is_train,
             )(*deduped_flat_args)
@@ -1094,6 +1095,7 @@ def wrapped_flat_fn(*args):
         if config.debug_assert:
             ref_fw_metadata = run_functionalized_fw_and_collect_metadata(
                 wrapped_flat_fn,
+                static_input_indices=aot_config.static_input_indices,
                 keep_input_mutations=fw_metadata.keep_input_mutations,
                 is_train=fw_metadata.is_train,
             )(*flat_args_with_synthetic_bases)

torch/_functorch/_aot_autograd/schemas.py

Lines changed: 2 additions & 1 deletion
@@ -328,7 +328,7 @@ class ViewAndMutationMeta:
     deterministic: Optional[bool] = None
 
     # Keeps track of which input indices store parameters (which we will treat as static)
-    static_parameter_indices: List[int] = field(default_factory=list)
+    static_input_indices: List[int] = field(default_factory=list)
 
     # Map of effect type (ex. _EffectType.ORDERED) to token. If there are
     # side-effectful operators, FunctionalTensorMode will populate this
@@ -802,6 +802,7 @@ class AOTConfig:
     no_tangents: bool = False
     dynamic_shapes: bool = False
    aot_autograd_arg_pos_to_source: Optional[List[Source]] = None
+    static_input_indices: Optional[List[int]] = None
    inference_compiler: Optional[Callable] = None
    enable_log: bool = True
    # this is always false outside of export.

torch/_functorch/_aot_autograd/traced_function_transforms.py

Lines changed: 1 addition & 0 deletions
@@ -729,6 +729,7 @@ def metadata_fn(*primals):
         # See Note: [Partitioner handling for Subclasses, Part 2] for more info.
         meta_updated = run_functionalized_fw_and_collect_metadata(
             metadata_fn,
+            static_input_indices=meta.static_input_indices,
             keep_input_mutations=meta.keep_input_mutations,
             is_train=meta.is_train,
         )(*primals_unwrapped)

torch/_functorch/aot_autograd.py

Lines changed: 12 additions & 2 deletions
@@ -20,6 +20,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+
 from . import config
 from ._aot_autograd.autograd_cache import (  # noqa: F401
     AOTAutogradCache,
@@ -583,11 +584,13 @@ def _dup_fake_script_obj(fake_flat_args):
     with ctx:
         fw_metadata = run_functionalized_fw_and_collect_metadata(
             flat_fn,
+            static_input_indices=aot_config.static_input_indices,
             keep_input_mutations=aot_config.keep_inference_input_mutations,
             is_train=needs_autograd,
             pre_dispatch=aot_config.pre_dispatch,
         )(*_dup_fake_script_obj(fake_flat_args))
 
+        fw_metadata.static_input_indices = aot_config.static_input_indices
         req_subclass_dispatch = requires_subclass_dispatch(
             fake_flat_args, fw_metadata
         )
@@ -631,7 +634,7 @@ def _dup_fake_script_obj(fake_flat_args):
                subclass_tangent_meta=fw_metadata.subclass_tangent_meta,
                is_train=False,
                tokens=fw_metadata.tokens,
-                static_parameter_indices=fw_metadata.static_parameter_indices,
+                static_input_indices=fw_metadata.static_input_indices,
            )
 
        if fw_metadata.num_intermediate_bases > 0:
@@ -936,9 +939,10 @@ def aot_module_simplified(
     # Next, the input args
     full_args.extend(args)
 
+    static_input_indices = []
     if hasattr(mod, "graph"):
         # Non dynamo entrypoints can get to here...
-        for node in mod.graph.find_nodes(op="placeholder"):
+        for pos, node in enumerate(mod.graph.find_nodes(op="placeholder")):
             if hasattr(node, "_dynamo_source"):
                 # ... but not here!
                 if aot_autograd_arg_pos_to_source is None:
@@ -948,6 +952,11 @@ def aot_module_simplified(
                 seen_sources.add(source)
                 aot_autograd_arg_pos_to_source.append(source)
 
+            if "tensor_dict" in node.meta and node.meta["tensor_dict"].get(
+                "_dynamo_static_input_type", None
+            ):
+                static_input_indices.append(pos)
+
     if aot_autograd_arg_pos_to_source is not None:
         assert len(full_args) == len(aot_autograd_arg_pos_to_source)
 
@@ -968,6 +977,7 @@ def aot_module_simplified(
         keep_inference_input_mutations=keep_inference_input_mutations,
         dynamic_shapes=dynamic_shapes,
         aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source,
+        static_input_indices=static_input_indices,
         is_export=False,
         no_tangents=False,
         cache_key=None,
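
Aside (not part of the commit): aot_module_simplified now derives static_input_indices from placeholder metadata, as added above, and forwards it via AOTConfig. A self-contained sketch of that collection logic on a toy FX graph; the manual tensor_dict assignment and the "guarded" value stand in for what Dynamo would record, and Graph.find_nodes requires a PyTorch recent enough to have it (it is used by the diff itself).

import torch
import torch.fx


def collect_static_input_indices(gm: torch.fx.GraphModule):
    # Record positions of placeholders tagged as static inputs, mirroring the
    # loop added to aot_module_simplified above.
    static_input_indices = []
    for pos, node in enumerate(gm.graph.find_nodes(op="placeholder")):
        if "tensor_dict" in node.meta and node.meta["tensor_dict"].get(
            "_dynamo_static_input_type", None
        ):
            static_input_indices.append(pos)
    return static_input_indices


def f(p, x):
    return p * x


gm = torch.fx.symbolic_trace(f)
placeholders = list(gm.graph.find_nodes(op="placeholder"))
# Pretend Dynamo tagged the first input (e.g. a parameter) as static:
placeholders[0].meta["tensor_dict"] = {"_dynamo_static_input_type": "guarded"}
print(collect_static_input_indices(gm))  # [0]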

torch/_inductor/compile_fx.py

Lines changed: 2 additions & 2 deletions
@@ -137,7 +137,7 @@ def get_static_input_idxs(num_fixed):
     if not context or not context.fw_metadata:
         return fixed
 
-    return fixed + context.fw_metadata.static_parameter_indices
+    return fixed + context.fw_metadata.static_input_indices
 
 
 @functools.lru_cache(None)
@@ -1254,7 +1254,7 @@ def fw_compiler_freezing(
                params_flat[i] = None
 
        if tracing_context.fw_metadata:
-            static_input_idxs += tracing_context.fw_metadata.static_parameter_indices
+            static_input_idxs += tracing_context.fw_metadata.static_input_indices
 
    with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
        optimized_function = inner_compile(
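
Aside (not part of the commit): after the rename, Inductor's get_static_input_idxs combines the leading num_fixed positions with whatever AOTAutograd recorded in fw_metadata.static_input_indices. A rough sketch of that combination; treating `fixed` as a range prefix and using a SimpleNamespace as a stand-in for the real fw_metadata are assumptions, not code from this file.

from types import SimpleNamespace


def get_static_input_idxs_sketch(num_fixed, fw_metadata):
    # First `num_fixed` positions are already treated as fixed by Inductor;
    # the metadata adds the parameter/buffer positions recorded by AOTAutograd.
    fixed = list(range(num_fixed))
    if not fw_metadata:
        return fixed
    return fixed + fw_metadata.static_input_indices


meta = SimpleNamespace(static_input_indices=[3, 5])
print(get_static_input_idxs_sketch(2, meta))  # [0, 1, 3, 5]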
