
Commit ac93bae

Author: Wei Wei

[fx2trt] improve to_dtype (#48)

Summary: Pull Request resolved: https://github.com/pytorch/fx2trt/pull/48

Currently, to_dtype supports only
1) to(dtype)

This diff extends the op to handle more cases:
2) to(torch.device)         # gpu
3) to(torch.device, dtype)  # gpu

(Note: this ignores all push blocking failures!)

Reviewed By: 842974287

Differential Revision: D35331003

fbshipit-source-id: 4dee2b3c7899805fa4f3c91d0a16207241396647
1 parent 5f83e2c commit ac93bae
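
In plain PyTorch terms, the three call patterns the lowered to_dtype op now covers look like this (a minimal sketch; the module and tensor names are illustrative, not part of the commit):

    import torch

    class ToVariants(torch.nn.Module):
        def forward(self, x):
            a = x.to(torch.float16)                        # case 1: to(dtype)
            b = x.to(torch.device("cuda"))                 # case 2: to(torch.device), gpu only
            c = x.to(torch.device("cuda"), torch.float16)  # case 3: to(torch.device, dtype)
            return a, b, c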

File tree

4 files changed, +161 -9 lines changed


fx/converters/acc_ops_converters.py

Lines changed: 29 additions & 0 deletions

@@ -14,6 +14,7 @@
 from fx2trt_oss.fx.types import *  # noqa: F403
 from fx2trt_oss.fx.utils import (
     torch_dtype_from_trt,
+    torch_dtype_to_trt,
     get_dynamic_dims,
 )
 from torch.fx.immutable_collections import immutable_list
@@ -1237,6 +1238,34 @@ def acc_ops_minimum(
     )


+@tensorrt_converter(acc_ops.device)
+def acc_ops_device(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    # TRT always assumes the device is cuda, not cpu.
+    return torch.device("cuda")
+
+
+@tensorrt_converter(acc_ops.to_dtype)
+def acc_ops_to_dtype(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    input_val = kwargs["input"]
+    input_dtype = kwargs["acc_out_ty"].dtype
+    input_t = get_trt_tensor(network, input_val, f"{name}_input_t")
+
+    if input_dtype:
+        input_dtype = torch_dtype_to_trt(input_dtype)
+        input_t = type_cast(network, target, f"{name}_input", input_t, input_dtype)
+    return input_t
+

 @tensorrt_converter(acc_ops.logical_not)
 def acc_ops_logical_not(
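
Here torch_dtype_to_trt maps the torch dtype to its TensorRT equivalent, and type_cast inserts the actual cast; type_cast itself is not part of this diff. A sketch of how such a cast is commonly expressed with the TensorRT Python API, as an assumption about the helper rather than the repo's actual implementation:

    import tensorrt as trt

    def type_cast_sketch(network, name, input_t, trt_dtype):
        # Hypothetical stand-in for the repo's type_cast helper: TRT casts are
        # often expressed as an identity layer with a requested output type.
        layer = network.add_identity(input_t)
        layer.set_output_type(0, trt_dtype)  # e.g. trt.float16
        layer.name = name
        return layer.get_output(0)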
Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+import torch
+import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
+from torch.testing._internal.common_fx2trt import AccTestCase
+from torch.testing._internal.common_utils import run_tests
+from fx2trt_oss.fx.utils import LowerPrecision
+
+
+class TestToDtypeConverter(AccTestCase):
+    def test_fp16(self):
+        class To(torch.nn.Module):
+            def forward(self, x):
+                return x.to(torch.float16)
+
+        input = torch.randn(2, 2)
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={acc_ops.to_dtype}, test_implicit_batch_dim=False, precision=LowerPrecision.FP16)
+
+    def test_fp32(self):
+        class To(torch.nn.Module):
+            def forward(self, x):
+                return x.to(torch.float32)
+
+        input = torch.randn(2, 2).to(torch.float16)
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={acc_ops.to_dtype}, test_implicit_batch_dim=False)
+
+    def test_cuda_fp16(self):
+        class To(torch.nn.Module):
+            def forward(self, x):
+                return x.to(torch.device('cuda:0'), torch.float16)
+
+        input = torch.randn(2, 2)
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={acc_ops.to_dtype}, test_implicit_batch_dim=False, precision=LowerPrecision.FP16)
+
+    def test_cuda(self):
+        class To(torch.nn.Module):
+            def forward(self, x):
+                x = x.to(torch.device('cuda'))
+                # Append an extra layer, since to(device) is skipped in TRT.
+                return x + torch.randn(2, 2).cuda()
+
+        input = torch.randn(2, 2)
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={acc_ops.to_dtype, acc_ops.add}, test_implicit_batch_dim=False, precision=LowerPrecision.FP32)
+
+    def test_device(self):
+        class To(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.randn(2, 2)
+
+            def forward(self, x):
+                idevice = x.device
+                a = self.a.to(idevice)
+                return x + a
+
+        input = torch.randn(2, 2).cuda()
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={}, test_implicit_batch_dim=False, precision=LowerPrecision.FP32)
+
+    def test_device_fp16(self):
+        class To(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.randn(2, 2)
+
+            def forward(self, x):
+                idevice = x.device
+                a = self.a.to(idevice)
+                # The fx tracer could not handle "to(idevice, torch.float16)":
+                # TypeError: to() received an invalid combination of arguments - got (Attribute, torch.dtype)
+                return a.to(torch.float16)
+
+        input = torch.randn(2, 2).half().cuda()
+        inputs = [
+            input,
+        ]
+        self.run_test(To(), inputs, expected_ops={}, test_implicit_batch_dim=False, precision=LowerPrecision.FP16)
+
+
+if __name__ == '__main__':
+    run_tests()
test/tracer/test_acc_tracer.py

Lines changed: 2 additions & 1 deletion

@@ -2371,6 +2371,7 @@ def test_all_acc_ops_registered(self):
                 acc_ops.interpolate,
                 acc_ops.logical_and,
                 acc_ops.logical_not,
-                acc_ops.ne
+                acc_ops.ne,
+                acc_ops.device,
             },
         )

tracer/acc_tracer/acc_ops.py

Lines changed: 40 additions & 8 deletions

@@ -184,6 +184,10 @@ def sign(*, input):
 def size(*, input):
     return input.size()

+@register_acc_op_properties(AccOpProperty.unary)
+@register_acc_op
+def device(*, input):
+    return input.device

 @register_custom_acc_mapper_fn(
     op_and_target=("call_function", getattr),
@@ -203,10 +207,15 @@ def custom_getattr_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node:
         input_obj.meta["type"] == torch.Tensor
     ), f"Expected torch.Tensor type for {input_obj.meta['type']}"
     assert (
-        attr_name == "shape"
-    ), f"Only supporting shape getattr for now, not {attr_name}"
+        attr_name == "shape" or attr_name == "device"
+    ), f"Only supporting shape and device getattr for now, not {attr_name}"
+    if attr_name == "shape":
+        func = size
+    elif attr_name == "device":
+        func = device
+
     with node.graph.inserting_before(node):
-        size_node = node.graph.call_function(size, kwargs={"input": input_obj})
+        size_node = node.graph.call_function(func, kwargs={"input": input_obj})
         size_node.meta = node.meta.copy()
         return size_node
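
With the getattr mapper extended, an x.device access in a traced module is rewritten into an acc_ops.device call, which the converter shown earlier then resolves to cuda. Roughly, assuming the usual acc_tracer.trace entry point (a sketch, not code from the commit):

    import torch
    import fx2trt_oss.tracer.acc_tracer.acc_tracer as acc_tracer

    class M(torch.nn.Module):
        def forward(self, x):
            return x.to(x.device)  # x.device lowers to acc_ops.device(input=x)

    traced = acc_tracer.trace(M(), [torch.randn(2, 2)])
    print(traced.graph)  # expect an acc_ops.device node feeding acc_ops.to_dtype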

@@ -1993,29 +2002,52 @@ def custom_tensor_reshape_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.

 @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary)
 @register_acc_op
-def to_dtype(input, acc_out_ty=None):
+def to_dtype(input, acc_out_ty=None, device=None):
     assert acc_out_ty is not None
-    return input.to(dtype=acc_out_ty.dtype)
+    return input.to(dtype=acc_out_ty.dtype, device=device)


 @register_custom_acc_mapper_fn(
     op_and_target=("call_method", "to"),
     arg_replacement_tuples=[
         ("input", "input"),
         ("dtype", "dtype"),
+        ("device", "device", this_arg_is_optional),
     ],
 )
 def custom_tensor_to_mapper(node: torch.fx.Node, _: nn.Module):
-    dest_dtype = node.kwargs["dtype"]
+    dest = node.kwargs["dtype"]
     mem_format = node.kwargs.get("memory_format")
-    device = node.kwargs.get("device")
-    assert dest_dtype is not None
+    dest_other = node.kwargs.get("device")
+    assert dest is not None
     assert mem_format is None or mem_format == torch.preserve_format
-    assert device is None
+
+    dest_dtype = dest_device = None
+    if isinstance(dest, torch.fx.node.Node):
+        meta_type = dest.meta["type"]
+        # Assume the device is gpu only; the meta info is too limited to give a clear device type.
+        if dest.meta["type"] == torch.device:
+            dest_device = dest
+        else:
+            # Due to a limitation of FX, we cannot support to(torch.Tensor),
+            # since meta only contains 'type': <class 'torch.Tensor'>.
+            raise RuntimeError(f"We currently do not support to({meta_type})")
+    elif isinstance(dest, torch.device):
+        # Only the device is set; dtype stays None.
+        if dest_other is None:
+            dest_device = dest
+        # Both the device and dtype are set.
+        else:
+            dest_dtype = dest_other
+            dest_device = dest
+    # Only the dtype is set.
+    else:
+        dest_dtype = dest

     new_kwargs = {
         "input": node.kwargs["input"],
         "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dest_dtype),
+        "device": dest_device,
     }

     with node.graph.inserting_before(node):
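
Put together, the mapper's branching on dest and dest_other reduces to a small dtype/device split. A standalone restatement of that logic (the Node branch is omitted, and the helper name is made up for illustration):

    import torch

    def split_to_args(dest, dest_other):
        # Mirrors custom_tensor_to_mapper: returns (dest_dtype, dest_device)
        # for the rewritten to_dtype call.
        if isinstance(dest, torch.device):
            # device alone (dest_other is None), or device plus dtype
            return (dest_other, dest)
        return (dest, None)  # plain to(dtype)

    assert split_to_args(torch.float16, None) == (torch.float16, None)
    assert split_to_args(torch.device('cuda'), None) == (None, torch.device('cuda'))
    assert split_to_args(torch.device('cuda'), torch.float16) == (torch.float16, torch.device('cuda'))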
