Commit e4df382

feat: Add preliminary support for freezing tensors in Dynamo
1 parent 64ce49b commit e4df382

6 files changed: +254, -9 lines
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
from typing import Callable, Dict, Optional

import torch
import torch.utils._pytree as pytree
from torch import nn
from torch._functorch.aot_autograd import (
    AOT_COUNTER,
    AOTConfig,
    create_aot_dispatcher_function,
    create_functional_call,
)
from torch._functorch.partitioners import default_partition
from torch._subclasses import FakeTensor


def aot_module(
    mod: nn.Module,
    args,
    fw_compiler: Callable,
    partition_fn: Callable = default_partition,
    decompositions: Optional[Dict] = None,
    keep_inference_input_mutations=False,
) -> nn.Module:
    """
    Adapted from:
    https://github.com/pytorch/pytorch/blob/cce2b7e3c95a7505b41bdfc53939d84d56e31260/torch/_functorch/aot_autograd.py#L3656-L3776

    This is the simplified or low overhead version of aot_module. For frontends
    like TorchDynamo, the input functions/modules to AOT are static and have
    unpacked inputs/outputs. This gives us an opportunity to remove the
    (1) pytree overhead to parse inputs/outputs,
    (2) AOT Autograd cache,
    (3) Reading of params/buffers in every forward call


    :func:`aot_module_simplified` removes these overheads.
    """

    params = {
        **dict(mod.named_parameters(remove_duplicate=False)),
        **dict(mod.named_buffers(remove_duplicate=False)),
    }
    params_flat, params_spec = pytree.tree_flatten(params)
    params_flat = list(params_flat)
    params_len = len(params_flat)

    functional_call = create_functional_call(mod, params_spec, params_len)

    seen_sources = set()

    full_args = []
    # First, the params
    full_args.extend(params_flat)

    if torch._guards.TracingContext.get():
        torch._guards.TracingContext.get().params_flat = params_flat

    aot_autograd_arg_pos_to_source = None
    # Then, the params 1:1 mapped sources, if relevant.
    if hasattr(mod, "_param_name_to_source"):
        aot_autograd_arg_pos_to_source = []
        # We now know this came from dynamo, and (1) we care about guards,
        # so setting up aot_autograd_arg_pos_to_source for downstream dedup guards
        # can now be done safely. (2) Dynamo logic protects the 1:1 sizing below.
        for name in params.keys():
            assert name in mod._param_name_to_source, f"{name} not found."
            source = mod._param_name_to_source[name]
            assert source not in seen_sources, source
            seen_sources.add(source)
            aot_autograd_arg_pos_to_source.append(source)

    # Next, the input args
    full_args.extend(args)

    if hasattr(mod, "graph"):
        # Non dynamo entrypoints can get to here...
        for i, node in enumerate(mod.graph.nodes):
            if node.op == "placeholder":
                if hasattr(node, "_dynamo_source"):
                    # ... but not here!
                    if aot_autograd_arg_pos_to_source is None:
                        aot_autograd_arg_pos_to_source = []
                    source = node._dynamo_source
                    assert source not in seen_sources, source
                    seen_sources.add(source)
                    aot_autograd_arg_pos_to_source.append(source)

    if aot_autograd_arg_pos_to_source is not None:
        assert len(full_args) == len(aot_autograd_arg_pos_to_source)

    dynamic_shapes = False
    for x in full_args:
        if isinstance(x, FakeTensor):
            dynamic_shapes = x.fake_mode.shape_env is not None
            break

    aot_config = AOTConfig(
        fw_compiler=fw_compiler,
        bw_compiler=fw_compiler,
        inference_compiler=fw_compiler,
        partition_fn=partition_fn,
        decompositions=decompositions,
        num_params_buffers=params_len,
        aot_id=next(AOT_COUNTER),
        keep_inference_input_mutations=keep_inference_input_mutations,
        dynamic_shapes=dynamic_shapes,
        aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source,
        is_export=False,
        no_tangents=False,
    )

    compiled_fn = create_aot_dispatcher_function(
        functional_call,
        full_args,
        aot_config,
    )

    def forward(*runtime_args):
        full_args = []
        full_args.extend(runtime_args)
        return compiled_fn(full_args)

    # Just for convenience
    forward.zero_grad = mod.zero_grad
    forward.named_parameters = mod.named_parameters
    forward.named_buffers = mod.named_buffers

    return forward
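
Note: unlike the upstream aot_module_simplified, the forward returned above does not re-pack params_flat on every call; instead the flattened parameters are published on TracingContext.params_flat so that the freezing pass further down in this commit can inline them as constants. As a minimal illustration of the helper's API, here is a hypothetical sketch (not part of the commit) using a parameter-free module and a simple boxed debug compiler:

import torch
from torch._functorch.aot_autograd import make_boxed_compiler


@make_boxed_compiler
def debug_compiler(gm: torch.fx.GraphModule, example_inputs):
    # Print the aten-level graph produced by AOTAutograd, then run it as-is.
    print(gm.graph)
    return gm


class PointwiseOps(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) * 2.0


# No parameters, so the returned forward can be called with the runtime args alone.
compiled = aot_module(PointwiseOps(), [torch.randn(8)], fw_compiler=debug_compiler)
print(compiled(torch.randn(8)))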

py/torch_tensorrt/dynamo/backend/backends.py

Lines changed: 18 additions & 6 deletions
@@ -6,13 +6,17 @@

 import torch
 import torch._dynamo as td
-from torch._functorch.aot_autograd import aot_module_simplified, make_boxed_compiler
+from torch._functorch.aot_autograd import make_boxed_compiler
+from torch._guards import TracingContext
 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo.compile import compile_module
 from torch_tensorrt.dynamo.lowering._decompositions import get_decompositions
+from torch_tensorrt.dynamo.lowering._freeze_aot_graph import freeze_autograd_gm
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs

+from .aot_module import aot_module
+
 logger = logging.getLogger(__name__)


@@ -33,8 +37,9 @@ def torch_tensorrt_backend(

     DEFAULT_BACKEND = aot_torch_tensorrt_aten_backend

-    compiled_mod: torch.nn.Module = DEFAULT_BACKEND(gm, sample_inputs, **kwargs)
-    return compiled_mod
+    TracingContext.get().fake_mode.allow_non_fake_inputs = True
+
+    return DEFAULT_BACKEND(gm, sample_inputs, **kwargs)


 @td.register_backend(name="aot_torch_tensorrt_aten")  # type: ignore[misc]
@@ -52,7 +57,7 @@ def aot_torch_tensorrt_aten_backend(
     gm = pre_aot_substitutions(gm)

     # Invoke AOTAutograd to translate operators to aten
-    return aot_module_simplified(
+    return aot_module(
         gm,
         sample_inputs,
         fw_compiler=make_boxed_compiler(custom_backend),
@@ -77,9 +82,16 @@ def _pretraced_backend(
     try:
         logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))

+        frozen_gm, unfrozen_indices = freeze_autograd_gm(gm, sample_inputs)
+        nonfrozen_inputs = [sample_inputs[idx] for idx in unfrozen_indices]
+
+        frozen_gm.graph.eliminate_dead_code()
+        frozen_gm.graph.lint()
+        frozen_gm.recompile()
+
         trt_compiled = compile_module(
-            gm,
-            sample_inputs,
+            frozen_gm,
+            nonfrozen_inputs,
             settings=settings,
         )
         return trt_compiled
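
For context, a hypothetical end-to-end sketch of exercising this path (not part of the commit). It assumes torch_tensorrt and TensorRT are installed, a CUDA device is available, and that torch_tensorrt_backend above is registered under the "torch_tensorrt" backend name (only the "aot_torch_tensorrt_aten" registration is visible in this hunk):

import torch
import torch_tensorrt  # noqa: F401  importing registers the Dynamo backends

model = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU()).cuda().eval()
inputs = [torch.randn(8, 32).cuda()]

# Dynamo traces the module, aot_module lowers it to aten, freeze_autograd_gm
# inlines the Linear weight/bias as constants, and compile_module builds the
# TensorRT engine over the frozen graph with only the activations as inputs.
compiled = torch.compile(model, backend="torch_tensorrt")
out = compiled(*inputs)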

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 29 additions & 2 deletions
@@ -3,14 +3,15 @@
 from datetime import datetime
 from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set

-import numpy
+import numpy as np

 # @manual=//deeplearning/trt/python:py_tensorrt
 import tensorrt as trt
 import torch
 import torch.fx
 from torch.fx.node import _get_qualified_name
 from torch.fx.passes.shape_prop import TensorMetadata
+from torch.utils._python_dispatch import _disable_current_modes
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo.conversion.converter_utils import get_node_name
 from torch_tensorrt.fx.observer import Observer
@@ -169,7 +170,7 @@ def run(

         cache = None
         if timing_cache:
-            cache_file = numpy.array(timing_cache)
+            cache_file = np.array(timing_cache)
             cache = builder_config.create_timing_cache(cache_file.tobytes())
         else:
             cache = builder_config.create_timing_cache(b"")
@@ -323,6 +324,21 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any:
         assert self._cur_node_name is not None
         return converter(self.network, target, args, kwargs, self._cur_node_name)

+    def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray:
+        with _disable_current_modes():
+            from torch_tensorrt.fx.converters import to_numpy
+
+            frozen_attr = self.fetch_attr(target)
+
+            if isinstance(frozen_attr, torch.nn.Parameter):
+                constant_tensor = frozen_attr.data
+            else:
+                constant_tensor = frozen_attr
+
+            network_constant = to_numpy(constant_tensor)
+
+            return network_constant
+
     def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
         assert isinstance(target, str)
         converter = CONVERTERS.get(self._cur_node)
@@ -344,6 +360,17 @@ def output(self, target: str, args: Any, kwargs: Any) -> List[Any]:
         else:
             outputs = (args[0],)

+        for output_idx in range(len(outputs)):
+            from torch_tensorrt.fx.converters import get_trt_tensor
+
+            output = outputs[output_idx]
+
+            if not isinstance(output, trt.tensorrt.ITensor):
+                new_output = get_trt_tensor(self.network, output, target)
+                outputs = (
+                    outputs[:output_idx] + (new_output,) + outputs[output_idx + 1 :]
+                )
+
         if not all(isinstance(output, trt.tensorrt.ITensor) for output in outputs):
             raise RuntimeError("TensorRT requires all outputs to be Tensor!")


py/torch_tensorrt/dynamo/lowering/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 from ._decompositions import get_decompositions  # noqa: F401
+from ._freeze_aot_graph import *  # noqa: F401
 from ._fusers import *  # noqa: F401
 from ._pre_aot_lowering import SUBSTITUTION_REGISTRY  # noqa: F401
 from ._pre_aot_lowering import register_substitution  # noqa: F401

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import unittest
from typing import List, Sequence, Tuple

import torch
import torch.fx.traceback as fx_traceback
from torch._dynamo.utils import detect_fake_mode
from torch._functorch.compile_utils import fx_graph_cse
from torch._inductor.compile_fx import fake_tensor_prop
from torch._inductor.freezing import constant_fold, replace_params_with_constants
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.passes.tools_common import legalize_graph


def freeze_autograd_gm(
    aot_autograd_gm: torch.fx.GraphModule,
    example_inputs: Sequence[torch._subclasses.FakeTensor],
) -> Tuple[torch.fx.GraphModule, List[int]]:
    """
    Adapted from:
    https://github.com/pytorch/pytorch/blob/750b9b359f06cb8b8c2d5b6118bba636e2112cbb/torch/_inductor/freezing.py#L186-L243

    Inlines parameters that are not mutated into constants and optimizes the graph
    through constant propagation and other techniques. If enabled, the function also
    discards the original parameters of the module for memory efficiency.

    Assumes that this function is run in dynamo tracing post aot_autograd.

    Args:
        aot_autograd_gm (torch.fx.GraphModule): The aot_autograd-constructed GraphModule to be frozen.
        example_inputs (List[torch.Tensor]): A list of example input tensors to be used in the freezing process.

    Returns:
        Tuple[torch.fx.GraphModule, List[int]]: A tuple containing the frozen GraphModule and a list of indices
        of the inputs that were preserved (not turned into constants).
    """
    # Extract necessary metadata and parameters
    fw_metadata = torch._guards.TracingContext.get().fw_metadata
    params_flat = torch._guards.TracingContext.get().params_flat
    assert fw_metadata is not None and params_flat is not None

    # Replace placeholders with get_attr nodes
    preserved_arg_indices = replace_params_with_constants(
        aot_autograd_gm, params_flat, fw_metadata
    )

    constant_fold(aot_autograd_gm)

    fake_mode = detect_fake_mode(example_inputs)

    # constant params will be real tensors, not fake
    # TODO: fake_mode should enable the py dispatcher if it is symbolic?
    with unittest.mock.patch.object(
        fake_mode, "allow_non_fake_inputs", True
    ), fake_mode:
        args = [e for i, e in enumerate(example_inputs) if i in preserved_arg_indices]
        with fx_traceback.preserve_node_meta():
            aot_autograd_gm = make_fx(aot_autograd_gm, _allow_non_fake_inputs=True)(
                *args
            )

    # TODO - further restrict cse? right now it is needed to dedup aliasing ops
    cse_graph = fx_graph_cse(aot_autograd_gm.graph)
    aot_autograd_gm.graph = cse_graph
    aot_autograd_gm.recompile()

    # Make sure meta['val'] is properly set up (weight conversion
    # or decompose_unfused_batchnorms lost meta['val']).
    aot_example_inputs = [example_inputs[ind] for ind in preserved_arg_indices]
    fake_tensor_prop(aot_autograd_gm, aot_example_inputs, True)

    # TODO - apply legalization in pattern matcher
    legalize_graph(aot_autograd_gm)
    constant_fold(aot_autograd_gm)

    return aot_autograd_gm, preserved_arg_indices
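
For reference, a small sketch (not part of the commit, names illustrative) of the contract this helper exposes, mirroring how _pretraced_backend consumes it above; it assumes an active TracingContext whose fw_metadata and params_flat were populated during AOT tracing:

# Inside the Dynamo backend, after AOTAutograd has produced aot_gm:
frozen_gm, preserved_arg_indices = freeze_autograd_gm(aot_gm, example_inputs)

# Parameters and buffers are no longer graph placeholders; only the preserved
# indices (the real runtime inputs) are forwarded to TensorRT conversion.
nonfrozen_inputs = [example_inputs[i] for i in preserved_arg_indices]

# The frozen values now live on the GraphModule as attributes reached through
# get_attr nodes, which the interpreter's new get_attr handler converts to
# numpy constants for the TensorRT network.
num_placeholders = sum(1 for n in frozen_gm.graph.nodes if n.op == "placeholder")
assert num_placeholders == len(preserved_arg_indices)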

py/torch_tensorrt/dynamo/partitioning/_global_partitioner.py

Lines changed: 4 additions & 1 deletion
@@ -121,7 +121,10 @@ def is_node_supported(
     ) -> bool:
         node_name = ConverterRegistry.qualified_name_or_str(node.target)

-        if node in CONVERTERS and node_name not in self.torch_executed_ops:
+        if (
+            node.target in CONVERTERS.keys()
+            or (node.op == "get_attr" and "constant" in node_name)
+        ) and node_name not in self.torch_executed_ops:
             # If node is a proper, supported computational node, store the operator
             if not node.is_impure():
                 if node_name not in self.supported_operators:
