
Commit 3047783

Yinghai Lu authored and Wei Wei committed
Print shape info in trt profiler (#12)
Summary: Pull Request resolved: https://github.com/pytorch/fx2trt/pull/12

Now we can provide shape info for the sorted per-layer profile, which will be quite convenient for flushing out low-hanging fruit. Note that in order to get shape info, we need to turn the verbose profile flag to true, which triggers `trt.ProfilingVerbosity.DETAILED` mode.

Reviewed By: jasonjk-park, 842974287

Differential Revision: D34712362

fbshipit-source-id: 82b94ca939a54ff0e1340789da80449915fd0b0e
1 parent ced1978 commit 3047783

File tree

3 files changed: +44, -3 lines changed


fx/lower.py

Lines changed: 7 additions & 1 deletion
@@ -9,6 +9,7 @@
 import torch
 import torch.fx as fx
 import torch.nn as nn
+from fx2trt_oss.fx.observer import Observer
 from fx2trt_oss.tracer.acc_tracer import acc_ops
 from torch.fx.experimental.const_fold import split_const_subgraphs
 from torch.fx.passes.splitter_base import SplitResult
@@ -34,7 +35,6 @@
 from .trt_module import (
     TRTModule,
 )
-from fx2trt_oss.fx.observer import Observer


 logger = logging.getLogger(__name__)
@@ -182,6 +182,8 @@ class LowerSetting:
         modules will not be traced into.

     cuda_graph_batch_size (int): Cuda graph batch size, default to be -1.
+
+    verbose_profile (bool): verbosity of profiler, default to False
     """
     max_batch_size: int = 2048
     input_specs: List[InputTensorSpec] = dc.field(default_factory=list)
@@ -200,6 +202,7 @@ class LowerSetting:
     ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None
     leaf_module_list: Optional[Set[Type[nn.Module]]] = None
     cuda_graph_batch_size: int = -1
+    verbose_profile: bool = False


 def run_const_fold(traced_mod: torch.fx.GraphModule) -> torch.fx.GraphModule:
@@ -283,6 +286,9 @@ def __call__(self, mod, input, split_name) -> TRTInterpreterResult:
             strict_type_constraints=self.lower_setting.strict_type_constraints,
             algorithm_selector=algo_selector,
             timing_cache=cache_data,
+            profiling_verbosity=trt.ProfilingVerbosity.DETAILED
+            if self.lower_setting.verbose_profile
+            else trt.ProfilingVerbosity.LAYER_NAMES_ONLY,
         )

         # Update timing cache file if needed
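
With this change, detailed per-layer profiling is toggled from LowerSetting rather than by touching the interpreter directly. A minimal usage sketch, assuming the lowering entry point in your checkout accepts a LowerSetting and that the import path fx2trt_oss.fx.lower is valid for your install (both are assumptions for illustration; only the verbose_profile field comes from this diff):

    # Hedged sketch: import path and surrounding lowering call are assumed.
    from fx2trt_oss.fx.lower import LowerSetting

    lower_setting = LowerSetting(
        max_batch_size=64,
        verbose_profile=True,  # engine is built with trt.ProfilingVerbosity.DETAILED
    )
    # ... hand lower_setting to your lowering workflow to produce a TRTModule ...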

fx/tools/trt_profiler_sorted.py

Lines changed: 30 additions & 2 deletions
@@ -1,5 +1,9 @@
 import tensorrt as trt
 import operator
+from typing import Optional, Mapping, List
+import torch
+import json
+from fx2trt_oss.fx import TRTModule


 class SortedTRTProfiler(trt.IProfiler):
@@ -10,8 +14,32 @@ def __init__(self):
     def report_layer_time(self, layer_name: str, ms: int) -> None:
         self.layers[layer_name] = ms

-    def print_sorted_profile(self) -> None:
+    def print_sorted_profile(self, additional_info: Optional[Mapping[str, str]]) -> None:
+        additional_info = {} if additional_info is None else additional_info
         for k, v in sorted(self.layers.items(), key=operator.itemgetter(1)):
-            print(f"{k}: {v}ms")
+            additional_str = additional_info.get(k, "")
+            print(f"{k} {additional_str}: {v}ms")


+def profile_trt_module(
+    name: str, trt_mod: TRTModule, mod_input: List[torch.Tensor]
+) -> None:
+    """
+    Provide per layer timing and shape info
+    """
+    layer_info = json.loads(trt_mod.get_layer_info())  # pyre-ignore[29]
+    shape_map = {}
+    for layer in layer_info["Layers"]:
+        name = layer["Name"]
+        input_str = ", ".join(
+            [str(x.get("Dimensions", "[]")) for x in layer.get("Inputs", [])]
+        )
+        output_str = ", ".join(
+            [str(x.get("Dimensions", "[]")) for x in layer.get("Outputs", [])]
+        )
+        shape_map[name] = f"({input_str}) -> ({output_str})"
+
+    trt_mod.enable_profiling(profiler=SortedTRTProfiler())  # pyre-ignore[29]
+    _ = trt_mod(*mod_input)
+    trt_mod.context.profiler.print_sorted_profile(shape_map)  # pyre-ignore[16]
+    trt_mod.disable_profiling()  # pyre-ignore[29]
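
Taken together, the new profile_trt_module helper parses the engine's layer-info JSON into a name -> "(input dims) -> (output dims)" map, attaches a SortedTRTProfiler, runs one forward pass, and prints the layers sorted by time with their shapes attached. A hedged usage sketch (the module path, input shape, and the trt_mod variable are assumptions; the engine must have been built with detailed profiling verbosity, e.g. verbose_profile=True above):

    import torch
    from fx2trt_oss.fx.tools.trt_profiler_sorted import profile_trt_module

    # Assumed setup: trt_mod is a TRTModule lowered with verbose_profile=True
    # and the sample input matches the engine's expected input signature.
    sample_inputs = [torch.randn(8, 3, 224, 224, device="cuda")]
    profile_trt_module("example_engine", trt_mod, sample_inputs)
    # Prints one line per layer, "<layer> (<input dims>) -> (<output dims>): <ms>ms",
    # sorted ascending so the most expensive layers appear last.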

fx/trt_module.py

Lines changed: 7 additions & 0 deletions
@@ -217,3 +217,10 @@ def disable_profiling(self):
         torch.cuda.synchronize()
         del self.context
         self.context = self.engine.create_execution_context()
+
+    def get_layer_info(self) -> str:
+        """
+        Get layer info of the engine. Only support for TRT > 8.2.
+        """
+        inspector = self.engine.create_engine_inspector()
+        return inspector.get_engine_information(trt.LayerInformationFormat.JSON)
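
get_layer_info exposes TensorRT's engine inspector output, which is what profile_trt_module consumes above: a JSON document with a top-level "Layers" list whose entries carry "Name" plus "Inputs"/"Outputs" with "Dimensions" when the engine was built in detailed mode. A small, hedged sketch of inspecting it directly (assumes trt_mod wraps a TensorRT >= 8.2 engine built with DETAILED profiling verbosity):

    import json

    # Assumption: trt_mod.engine was built with trt.ProfilingVerbosity.DETAILED;
    # otherwise the inspector reports little or no per-layer detail.
    layer_info = json.loads(trt_mod.get_layer_info())
    for layer in layer_info["Layers"]:
        in_dims = [x.get("Dimensions", "[]") for x in layer.get("Inputs", [])]
        out_dims = [x.get("Dimensions", "[]") for x in layer.get("Outputs", [])]
        print(f'{layer["Name"]}: {in_dims} -> {out_dims}')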
