Commit 26a3059

fix: Add support for truncate_long_and_double

- Add Dynamo compile support for the `truncate_long_and_double` compilation argument by intercepting long/double-type inputs and casting them to their 32-bit counterparts prior to usage in TRT-accelerated subgraphs, then casting back if necessary
- Add robust logic to handle 64-bit inputs and outputs
- Add test cases for long and double scenarios
- Centralize the truncation utility for later use in the Dynamo export path

1 parent 7fc742f · commit 26a3059
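
The centralized utility itself lives in the new truncate_long_and_double module, whose diff is not rendered in this view. As a rough sketch of the mechanism described above, based only on the inline logic this commit removes from backends.py (the `_sketch` suffix is mine, and the real utility also rewrites the enclosing FX graph, as the removed repair_long_or_double_input shown further below did):

from typing import Sequence

import torch


def repair_long_or_double_inputs_sketch(
    parent_module: torch.fx.GraphModule,
    submodule: torch.fx.GraphModule,
    submodule_inputs: Sequence[torch.Tensor],
) -> Sequence[torch.Tensor]:
    """Hedged sketch: downcast 64-bit subgraph inputs to their 32-bit twins.

    Mirrors the inline loop removed from backends.py; the actual
    implementation additionally inserts casts into parent_module's graph so
    downstream Torch ops still observe 64-bit values where required.
    """
    repaired_inputs = list(submodule_inputs)

    for position, param in enumerate(repaired_inputs):
        # TRT cannot ingest long/double tensors, so truncate them to 32-bit
        if param.dtype in (torch.int64, torch.float64):
            dtype_32bit = torch.int32 if param.dtype == torch.int64 else torch.float32
            repaired_inputs[position] = param.to(dtype_32bit)

    return tuple(repaired_inputs)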

File tree

5 files changed: +322, -177 lines


py/torch_tensorrt/dynamo/backend/backends.py (4 additions, 28 deletions)

@@ -17,7 +17,7 @@
     get_submod_inputs,
 )

-from torch_tensorrt.dynamo.backend.utils import repair_long_or_double_input
+from torch_tensorrt.dynamo.common import repair_long_or_double_inputs
 from torch_tensorrt.dynamo.backend.conversion import convert_module

 from torch._dynamo.backends.common import fake_tensor_unsupported
@@ -168,33 +168,9 @@ def _compile_module(

         # Handle long/double inputs if requested by the user
         if settings.truncate_long_and_double:
-            num_submodule_inputs = len(submodule_inputs)
-
-            # For each input to the TRT subgraph, check if its type is long/double
-            for position in range(num_submodule_inputs):
-                param = submodule_inputs[position]
-
-                # If the data type of the input is long/double, insert necessary
-                # casts to replace the operation
-                if param.dtype in (torch.int64, torch.float64):
-                    submodule_outputs = submodule(*submodule_inputs)
-                    repair_long_or_double_input(
-                        partitioned_module,
-                        position,
-                        name,
-                        submodule_outputs,
-                        param.dtype,
-                    )
-
-                    # Repair submodule inputs in accordance with inserted casts
-                    dtype_32bit = (
-                        torch.int32 if (param.dtype == torch.int64) else torch.float32
-                    )
-                    submodule_inputs = (
-                        submodule_inputs[:position]
-                        + (param.to(dtype_32bit),)
-                        + submodule_inputs[position + 1 :]
-                    )
+            submodule_inputs = repair_long_or_double_inputs(
+                partitioned_module, submodule, submodule_inputs
+            )

         # Create TRT Module from submodule
         trt_mod = convert_module(

py/torch_tensorrt/dynamo/backend/test/test_backend_compiler.py (112 additions, 1 deletion)

@@ -4,7 +4,7 @@
 from copy import deepcopy
 from torch_tensorrt.dynamo import compile
 from utils import lower_graph_testing
-from torch_tensorrt.dynamo.common.test_utils import DECIMALS_OF_AGREEMENT
+from torch_tensorrt.dynamo.common_utils.test_utils import DECIMALS_OF_AGREEMENT


 class TestTRTModuleNextCompilation(TestCase):
@@ -169,5 +169,116 @@ def forward(self, x, y):
         )


+class Test64BitInput(TestCase):
+    def test_float64_input_full_support(self):
+        class FullySupportedMultiOp(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.ops.aten.mean.dim(
+                    torch.ops.aten.mul.Tensor(torch.ops.aten.add.Tensor(x, y), 2), [0]
+                )
+
+        fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
+        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=3)
+
+        self.assertEquals(
+            len(list(partitioned_graph.named_children())),
+            1,
+            "All operators are supported, there should be one segment",
+        )
+
+        inputs = [
+            torch.randint(-5, 5, (16, 7), dtype=torch.double).cuda(),
+            torch.randint(-5, 5, (16, 7), dtype=torch.double).cuda(),
+        ]
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = compile(
+            fx_graph,
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            truncate_long_and_double=True,
+            debug=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"TRT outputs don't match with the original model.",
+        )
+
+    def test_int64_input_partial_support(self):
+        class PartiallySupportedMultiOp(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.ops.aten.div.Tensor_mode(
+                    x, torch.ops.aten.add.Tensor(y, y), rounding_mode="floor"
+                )
+
+        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp())
+        unexpected_ops = {torch.ops.aten.add.Tensor}
+
+        inputs = [
+            torch.randint(-40, 40, (16, 7, 5), dtype=torch.long).cuda(),
+            torch.randint(1, 40, (16, 7, 5), dtype=torch.long).cuda(),
+        ]
+
+        (unexpected_ops_seen, _, partitioned_graphs,) = lower_graph_testing(
+            fx_graph,
+            inputs,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+            torch_executed_ops={"torch.ops.aten.add.Tensor"},
+            testing_partitioning=True,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+        self.assertEquals(
+            len(partitioned_graphs),
+            1,
+            "Without control flow breaks, there should only be a single graph",
+        )
+        self.assertEquals(
+            len(list(partitioned_graphs[0].named_children())),
+            1,
+            "Certain operators are set to run in Torch, expected 1 segment",
+        )
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = compile(
+            fx_graph,
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            truncate_long_and_double=True,
+            debug=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"TRT outputs don't match with the original model.",
+        )
+
+
 if __name__ == "__main__":
     run_tests()
py/torch_tensorrt/dynamo/backend/utils.py (0 additions, 147 deletions)

@@ -1,5 +1,4 @@
 import torch
-from torch.fx.node import _get_qualified_name
 from typing import Any, Union, Sequence, Dict
 from torch_tensorrt import _Input, Device

@@ -66,149 +65,3 @@ def prepare_device(device: Union[Device, torch.device]) -> torch.device:
         )

     return device
-
-
-def _extract_downstream_get_nodes(
-    module_node: torch.fx.Node, output_indices: Sequence[int]
-) -> Sequence[torch.fx.Node]:
-    """Extracts downstream users of a node which get the item at a particular index
-
-    Certain module-type nodes have multiple outputs (tuple of outputs). This function
-    returns downstream nodes which call the _operator.getitem function, which extracts
-    the element at a particular index in the tuple
-
-    Args:
-        module_node: FX module-type node to analyze
-        output_indices: Indices in the module node output to search for
-    Returns:
-        List of nodes which get the item at the specified index in the module node output
-    """
-    get_nodes = []
-
-    # Iterate over all downstream users of the node object
-    for user in module_node.users:
-        # If the user is a "get" node accessing the specified index, store it
-        if _get_qualified_name(user.target) == "_operator.getitem" and (
-            user.args[1] in output_indices
-        ):
-            get_nodes.append(user)
-
-    return get_nodes
-
-
-def repair_long_or_double_input(
-    gm: torch.fx.GraphModule,
-    position: int,
-    submodule_name: str,
-    submodule_outputs: Union[torch.Tensor, Sequence[torch.Tensor]],
-    dtype: torch.dtype,
-):
-    """Fixes Long/Double type inputs to TRT-accelerated subgraphs
-
-    In-place modifies the provided graph
-
-    Inserts a cast to the 32-bit equivalent type for TRT, then if necessary,
-    inserts an upcast back to the 64-bit type for subsequent Torch operations
-
-    Args:
-        gm: FX GraphModule enclosing the TRT subgraph
-        position: Index in the submodule inputs at which the long or double input is found
-        submodule_name: Name of TRT-accelerated subgraph module in FX graph
-        submodule_outputs: Output tensor(s) of TRT-accelerated subgraph (used for dtypes/structure)
-        dtype: Data type of tensor at position in submodule (double/long)
-    """
-    assert dtype in (
-        torch.int64,
-        torch.float64,
-    ), f"dtype argument must be torch.int64 or torch.float64, got {dtype}"
-
-    # Determine target data type in 32 and 64 bit forms
-    dtype_64bit = dtype
-    dtype_32bit = torch.int32 if (dtype == torch.int64) else torch.float32
-
-    # Find the node representing the submodule in the graph
-    module_node = None
-
-    # Iterate over all nodes in the graph, seeking target module name match
-    for n in gm.graph.nodes:
-        if n.op == "call_module" and str(n.target) == submodule_name:
-            module_node = n
-            break
-
-    if module_node is None:
-        raise AssertionError(
-            f"Sought module node {submodule_name}, could not find in graph:\n{gm.graph}"
-        )
-
-    # Extract the 64-bit node of the input
-    node_64bit = module_node.all_input_nodes[position]
-
-    # Prior to the module, insert a cast to the 32-bit equivalent node
-    with gm.graph.inserting_before(module_node):
-        node_32bit = gm.graph.call_function(
-            torch.ops.aten._to_copy.default,
-            args=(node_64bit,),
-            kwargs={"dtype": dtype_32bit},
-        )
-
-    # Replace 64-bit input to TRT module with new 32-bit cast node
-    module_node.replace_input_with(node_64bit, node_32bit)
-
-    output_positions_64bit = set()
-    outputs_list = (
-        [submodule_outputs]
-        if isinstance(submodule_outputs, torch.Tensor)
-        else submodule_outputs
-    )

-    # Determine if any outputs of the model are 64-bit type and store their indices
-    for output_position, output in enumerate(outputs_list):
-        if output.dtype == dtype_64bit:
-            output_positions_64bit.add(output_position)
-
-    # Only enter this code block if there exists a 64-bit output
-    # This implies a cast is needed, since TRT cannot output 64-bit tensors
-    if output_positions_64bit:
-        # Determine whether the outputs of the module are tuple-type or not
-        is_tuple_output = False
-        if isinstance(submodule_outputs, tuple):
-            is_tuple_output = True
-
-        if not is_tuple_output:
-            # If the output is a single tensor, insert a cast back to int64
-            with gm.graph.inserting_after(module_node):
-                cast_node_64bit = gm.graph.call_function(
-                    torch.ops.aten._to_copy.default,
-                    args=(module_node,),
-                    kwargs={"dtype": dtype_64bit},
-                )
-
-            # Replace all uses of the TRT module (except the cast node) with the 64-bit equivalent
-            module_node.replace_all_uses_with(
-                cast_node_64bit, delete_user_cb=lambda user: (user != cast_node_64bit)
-            )
-
-        else:
-            # If the output is a tuple of tensors, extract downstream users for each 64-bit output
-            get_nodes = _extract_downstream_get_nodes(
-                module_node, output_positions_64bit
-            )
-
-            # For each downstream user, append a cast node back to the 64-bit precision
-            for get_node in get_nodes:
-                with gm.graph.inserting_after(get_node):
-                    cast_node_64bit = gm.graph.call_function(
-                        torch.ops.aten._to_copy.default,
-                        args=(get_node,),
-                        kwargs={"dtype": torch.int64},
-                    )
-
-                get_node.replace_all_uses_with(
-                    cast_node_64bit,
-                    delete_user_cb=lambda user: (user != cast_node_64bit),
-                )
-
-    # Clean up graph and ensure invariants are preserved
-    gm.graph.eliminate_dead_code()
-    gm.graph.lint()
-    gm.recompile()
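
The inserting_before/_to_copy graph-surgery pattern used by the removed function above (and presumably retained by the centralized utility) is easiest to see on a toy graph. A minimal, self-contained demonstration, assuming nothing beyond stock PyTorch FX; the Double module and the float32 target dtype are illustrative only:

import torch


class Double(torch.nn.Module):  # illustrative toy module
    def forward(self, x):
        return x * 2


gm = torch.fx.symbolic_trace(Double())

# Insert a cast to float32 before the multiply node, mimicking the
# inserting_before/_to_copy pattern from repair_long_or_double_input
for node in gm.graph.nodes:
    if node.op == "call_function":
        input_node = node.all_input_nodes[0]
        with gm.graph.inserting_before(node):
            cast = gm.graph.call_function(
                torch.ops.aten._to_copy.default,
                args=(input_node,),
                kwargs={"dtype": torch.float32},
            )
        # Reroute the multiply to consume the downcast value
        node.replace_input_with(input_node, cast)
        break

gm.graph.lint()
gm.recompile()

out = gm(torch.ones(4, dtype=torch.float64))
print(out.dtype)  # torch.float32: the downcast now happens inside the graph
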
py/torch_tensorrt/dynamo/common/__init__.py (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 from ._settings import CompilationSettings
-
 from .fx2trt import TRTInterpreter, TRTInterpreterResult
 from .input_tensor_spec import InputTensorSpec
+from .truncate_long_and_double import repair_long_or_double_inputs
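
End to end, the new tests exercise the flag through the Dynamo compile front-end. A condensed usage sketch along the same lines (the AddMean module is illustrative, and a CUDA device plus a TensorRT install are assumed):

import torch
from torch_tensorrt.dynamo import compile


class AddMean(torch.nn.Module):  # illustrative, mirrors the new tests
    def forward(self, x, y):
        return torch.ops.aten.mean.dim(torch.ops.aten.add.Tensor(x, y), [0])


model = torch.fx.symbolic_trace(AddMean())
inputs = [
    torch.randint(-5, 5, (16, 7), dtype=torch.double).cuda(),
    torch.randint(-5, 5, (16, 7), dtype=torch.double).cuda(),
]

# With truncate_long_and_double=True, float64/int64 inputs are downcast to
# their 32-bit counterparts at the TRT subgraph boundary and upcast back
# where subsequent Torch operations require 64-bit values
optimized = compile(
    model,
    inputs,
    min_block_size=1,
    truncate_long_and_double=True,
)
print(optimized(*inputs))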
