
Commit 80a8da2

fix: Remove input aliasing with builtin ops
- Add replacements for inplace builtin operators with their out-of-place equivalents
- Add utility to automatically perform replacement prior to AOT tracing
- Add test cases to verify inplace operators are replaced accurately
1 parent 90eec47 commit 80a8da2

File tree

5 files changed: +155 -59

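Note on the motivation: torch.fx.symbolic_trace records Python augmented assignment (e.g. x += 3) as a call_function node targeting the in-place builtin operator.iadd, which mutates its tensor argument when the graph runs. When that argument is a graph input, the compiled subgraph aliases and mutates caller memory, which TensorRT's functional-only execution model cannot represent. A minimal sketch of the failure mode, illustrative rather than part of this commit:

import operator

import torch


class InplaceAdd(torch.nn.Module):
    def forward(self, x):
        x += 3  # augmented assignment on a graph input
        return x + 1


gm = torch.fx.symbolic_trace(InplaceAdd())

# The traced graph carries an operator.iadd node fed by the placeholder,
# so executing the graph writes into the caller's tensor.
print(any(node.target is operator.iadd for node in gm.graph.nodes))  # True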

py/torch_tensorrt/dynamo/backend/backends.py

Lines changed: 10 additions & 56 deletions
@@ -2,17 +2,19 @@
 
 import logging
 import unittest
-from typing import Any, Callable, Dict, Optional, Sequence
+from typing import Any, Callable, Sequence
 
 import torch
 import torch._dynamo as td
-import torch.utils._pytree as pytree
 from torch._dynamo.utils import detect_fake_mode
-from torch._functorch.aot_autograd import _aot_export_function
-from torch._ops import OpOverload
+from torch._functorch.aot_autograd import aot_export_joint_simple
 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo.compile import compile_module
-from torch_tensorrt.dynamo.lowering import apply_lowering_passes, get_decompositions
+from torch_tensorrt.dynamo.lowering import (
+    apply_lowering_passes,
+    get_decompositions,
+    replace_builtin_inplace_ops,
+)
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs
 
@@ -74,8 +76,10 @@ def _pretraced_backend(
         with unittest.mock.patch.object(
             fake_mode, "allow_non_fake_inputs", True
         ), fake_mode:
+            replace_builtin_inplace_ops(gm)
+
             # Invoke AOTAutograd to translate operators to aten
-            gm = aot_export_for_compile(
+            gm = aot_export_joint_simple(
                 gm,
                 sample_inputs,
                 decompositions=get_decompositions(
@@ -110,53 +114,3 @@ def _pretraced_backend(
                 + "specify pass_through_build_failures=False."
             )
             raise
-
-
-def aot_export_for_compile(
-    func: torch.fx.GraphModule,
-    args: Sequence[torch.Tensor],
-    *,
-    decompositions: Optional[Dict[OpOverload, Callable[[Any], Any]]] = None,
-) -> torch.fx.GraphModule:
-    """Adapted from:
-    https://github.com/pytorch/pytorch/blob/1a5fdc2458b98697c75c32eb6f4b8b34d76429cf/torch/_functorch/aot_autograd.py#L4084-L4158
-
-    Removed check for input aliasing in resultant subgraph - TRT is functional-only
-
-    Exports the function to ATen for torch compile
-    """
-    # Trace function with input arguments and decompositions
-    with torch.no_grad():
-        fx_g, metadata, in_spec, out_spec = _aot_export_function(
-            func,
-            args,
-            decompositions=decompositions,
-        )
-
-    # No input mutations
-    if (
-        len([x for x in metadata.input_info if x.mutates_data or x.mutates_metadata])
-        != 0
-    ):
-        raise RuntimeError(
-            f"aot_export_joint_simple does not support input mutations. {str(metadata)}"
-        )
-    # No pytrees
-    if type(in_spec) == pytree.LeafSpec:
-        raise RuntimeError(
-            f"aot_export_for_compile requires inputs to be a single list/tuple. in_spec={str(in_spec)}"
-        )
-    if len([x for x in in_spec.children_specs if type(x) != pytree.LeafSpec]) != 0:
-        raise RuntimeError(
-            f"aot_export_for_compile requires individual inputs not to be pytrees. in_spec={str(in_spec)}"
-        )
-    if type(out_spec) == pytree.LeafSpec:
-        raise RuntimeError(
-            f"aot_export_for_compile requires outputs to be a single list/tuple. out_spec={str(out_spec)}"
-        )
-    if len([x for x in out_spec.children_specs if type(x) != pytree.LeafSpec]) != 0:
-        raise RuntimeError(
-            f"aot_export_for_compile requires individual outputs not to be pytrees. out_spec={str(out_spec)}"
-        )
-
-    return fx_g
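Note on the export path: the hand-rolled aot_export_for_compile removed above duplicated upstream logic minus the input-aliasing check, and the backend now calls the upstream aot_export_joint_simple directly, which itself rejects graphs that still mutate their inputs. That is why replace_builtin_inplace_ops(gm) must run before the export. A hedged sketch of the new call path; the trace_joint=False keyword is an assumption about the portion of the call truncated in the hunk above, not something this diff shows:

from torch._functorch.aot_autograd import aot_export_joint_simple


def export_to_aten(gm, sample_inputs, decompositions):
    # aot_export_joint_simple raises RuntimeError if any input is still
    # mutated, so in-place builtins must already be rewritten out of gm.
    return aot_export_joint_simple(
        gm,
        sample_inputs,
        trace_joint=False,  # assumed: inference-only (non-joint) export
        decompositions=decompositions,
    )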

py/torch_tensorrt/dynamo/lowering/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,5 +2,6 @@
 from ._fusers import *  # noqa: F401
 from ._pre_aot_lowering import SUBSTITUTION_REGISTRY  # noqa: F401
 from ._pre_aot_lowering import register_substitution  # noqa: F401
+from ._replace_inplace_ops import replace_builtin_inplace_ops
 from .passes import add_lowering_pass, apply_lowering_passes
 from .substitutions import *  # noqa: F401
py/torch_tensorrt/dynamo/lowering/_replace_inplace_ops.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+import logging
+import operator
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+BUILTIN_TRANSLATION = {
+    operator.ipow: operator.pow,
+    operator.imul: operator.mul,
+    operator.imatmul: operator.matmul,
+    operator.ifloordiv: operator.floordiv,
+    operator.itruediv: operator.truediv,
+    operator.imod: operator.mod,
+    operator.iadd: operator.add,
+    operator.isub: operator.sub,
+    operator.ilshift: operator.lshift,
+    operator.irshift: operator.rshift,
+    operator.iand: operator.and_,
+    operator.ixor: operator.xor,
+    operator.ior: operator.or_,
+}
+
+
+def replace_builtin_inplace_ops(gm: torch.fx.GraphModule) -> None:
+    """Replaces inplace builtins from Python's operator class
+
+    Replaces inplace builtins with out-of-place equivalent ops
+    """
+    for node in gm.graph.nodes:
+        # If a node uses one of the inplace builtins,
+        # replace it with its out-of-place equivalent
+        if node.target in BUILTIN_TRANSLATION:
+            out_of_place_op = BUILTIN_TRANSLATION[node.target]
+
+            # Insert the out-of-place replacement, then delete the inplace node
+            with gm.graph.inserting_before(node):
+                out_of_place = gm.graph.call_function(
+                    out_of_place_op,
+                    args=node.args,
+                    kwargs=node.kwargs,
+                )
+
+            logger.debug(f"Replacing {node.target} with {out_of_place.target}")
+
+            node.replace_all_uses_with(out_of_place)
+            gm.graph.erase_node(node)
+
+    gm.graph.lint()
+    gm.recompile()
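For context, a minimal usage sketch of the new pass, illustrative only: tracing a module that uses *= yields an operator.imul node, and running the pass rewrites it to operator.mul so the input is no longer mutated.

import operator

import torch
from torch_tensorrt.dynamo.lowering import replace_builtin_inplace_ops


class InplaceMul(torch.nn.Module):
    def forward(self, x):
        x *= 5.0  # traced as a call_function node targeting operator.imul
        return x + 1


gm = torch.fx.symbolic_trace(InplaceMul())
replace_builtin_inplace_ops(gm)

# The in-place builtin is gone; its out-of-place equivalent replaces it.
targets = [node.target for node in gm.graph.nodes]
print(operator.imul in targets, operator.mul in targets)  # False True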

tests/py/dynamo/backend/test_specialized_models.py

Lines changed: 85 additions & 0 deletions
@@ -57,6 +57,7 @@ def forward(self, x):
         self.assertAlmostEqual(
             max_diff,
             0,
+            DECIMALS_OF_AGREEMENT,
             msg=f"MulInt TRT outputs don't match with the original model.",
         )
         torch._dynamo.reset()
@@ -113,6 +114,7 @@ def forward(self, x):
         self.assertAlmostEqual(
             max_diff,
             0,
+            DECIMALS_OF_AGREEMENT,
             msg=f"AddFloat TRT outputs don't match with the original model.",
         )
 
@@ -236,5 +238,88 @@ def forward(self, x):
         torch._dynamo.reset()
 
 
+class TestInputModifications(TestCase):
+    def test_input_modifications_add(self):
+        class InplaceAdd(torch.nn.Module):
+            def forward(self, x):
+                x += 3
+                y = x + 1
+                return y
+
+        inputs = [
+            torch.rand(
+                3,
+                5,
+                7,
+            ).cuda(),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(InplaceAdd())
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            msg=f"InplaceAdd TRT outputs don't match with the original model.",
+        )
+        torch._dynamo.reset()
+
+    def test_input_modifications_mul(self):
+        class InplaceMul(torch.nn.Module):
+            def forward(self, x):
+                x *= 5.0
+                x *= 1.9
+                y = x + 1
+                y /= 1.3
+                return y
+
+        inputs = [
+            torch.rand(
+                1,
+                3,
+                5,
+                7,
+            ).cuda(),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(InplaceMul())
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            msg=f"InplaceMul TRT outputs don't match with the original model.",
+        )
+        torch._dynamo.reset()
+
+
 if __name__ == "__main__":
     run_tests()
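A note on the DECIMALS_OF_AGREEMENT insertions in the first two hunks: the third positional argument of unittest's assertAlmostEqual is places, which defaults to 7. Passing DECIMALS_OF_AGREEMENT (defined as 4 in testing_utilities.py) loosens the Torch/TRT comparison from roughly 1e-7 to roughly 1e-4 agreement, presumably to tolerate normal floating-point discrepancies in TRT engines. A small illustration:

import unittest


class ToleranceDemo(unittest.TestCase):
    def test_places(self):
        max_diff = 3e-5  # a typical floating-point accumulation discrepancy
        # With the default places=7 this would fail, since round(3e-5, 7) != 0:
        #     self.assertAlmostEqual(max_diff, 0)
        # With places=4 it passes, since round(3e-5, 4) == 0:
        self.assertAlmostEqual(max_diff, 0, 4)


if __name__ == "__main__":
    unittest.main()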

tests/py/dynamo/testing_utilities.py

Lines changed: 9 additions & 3 deletions
@@ -5,9 +5,13 @@
 
 import torch
 from torch._dynamo.utils import detect_fake_mode
+from torch._functorch.aot_autograd import aot_export_joint_simple
 from torch_tensorrt.dynamo import partitioning
-from torch_tensorrt.dynamo.backend.backends import aot_export_for_compile
-from torch_tensorrt.dynamo.lowering import apply_lowering_passes, get_decompositions
+from torch_tensorrt.dynamo.lowering import (
+    apply_lowering_passes,
+    get_decompositions,
+    replace_builtin_inplace_ops,
+)
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 
 DECIMALS_OF_AGREEMENT = 4
@@ -39,8 +43,10 @@ def fx_dynamo_testing_backend(
         with unittest.mock.patch.object(
             fake_mode, "allow_non_fake_inputs", True
         ), fake_mode:
+            replace_builtin_inplace_ops(gm)
+
             # Invoke AOTAutograd to translate operators to aten
-            gm = aot_export_for_compile(
+            gm = aot_export_joint_simple(
                 gm,
                 sample_inputs,
                 decompositions=get_decompositions(),
