Make mts benchmark profiling easier (#9)

Yinghai Lu · Wei Wei · commit 74187d8ad204 · 2022-06-03T17:54:11.000-07:00
Summary: Pull Request resolved: https://github.com/pytorch/fx2trt/pull/9 By default, we can now just profile the lowered TRT module and print its per-layer cost in sorted form. Also renamed `inline_cvr_7x_gpu_benchmark.py` to `mts_gpu_benchmark.py` as it has become quite generic now. Reviewed By: wushirong Differential Revision: D34690159 fbshipit-source-id: f30c18d2e139d934392fd7e253fd774f36a8ca11
diff --git a/fx/lower.py b/fx/lower.py
@@ -53,7 +53,7 @@
 # >>>     # print_module_and_input will be called right after the fuse passes
 # >>>     lower(module, sample_input)
 
-# Observer for the model after the fuse passes. 
+# Observer for the model after the fuse passes.
 FUSE_PASSES_POST_OBSERVER: Observer[
     Callable[[nn.Module, Input], None]
 ] = Observer("FUSE_PASSES_POST_OBSERVER")
@@ -66,7 +66,7 @@
 # Observer for the TRT split submodules after lowering
 LOWER_SPLIT_POST_OBSERVER: Observer[
     Callable[[str, nn.Module, Input], None]
-] = Observer("LOWER_SPLIT_PRE_OBSERVER")
+] = Observer("LOWER_SPLIT_POST_OBSERVER")
 # ----------------------------------------------------------------------
 
 
diff --git a/fx/tools/engine_layer_visualize.py b/fx/tools/engine_layer_visualize.py
@@ -1,5 +1,3 @@
-# (c) Facebook, Inc. and its affiliates. Confidential and proprietary.
-
 import argparse
 import re
 from typing import NamedTuple, List, Optional, Dict, Any, Tuple
@@ -17,9 +15,6 @@
 
 Usage:
     python fx2trt_oss.fx/tools/engine_layer_visualize.py --log_file aaa --profile_file bbb
-
-Usage(Facebook):
-    buck run //caffe2:trt_engine_layer_visualize -- --log_file aaa --profile_file bbb
 """
 
 
diff --git a/fx/tools/trt_profiler_sorted.py b/fx/tools/trt_profiler_sorted.py
@@ -0,0 +1,34 @@
+import operator
+
+import tensorrt as trt
+
+
+class SortedTRTProfiler(trt.IProfiler):
+    """TensorRT profiler that records per-layer times and prints them sorted.
+
+    TensorRT calls ``report_layer_time`` for each profiled layer; the reported
+    times are kept in ``self.layers`` (layer name -> milliseconds) until
+    ``print_sorted_profile`` is called.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Latest reported time per layer, keyed by layer name.
+        self.layers = {}
+
+    def report_layer_time(self, layer_name: str, ms: float) -> None:
+        """Record the time reported for one layer.
+
+        Args:
+            layer_name: Name of the layer being reported.
+            ms: Time spent in the layer, in milliseconds.
+
+        A later report for the same layer name replaces the earlier value.
+        """
+        self.layers[layer_name] = ms
+
+    def print_sorted_profile(self) -> None:
+        """Print ``<layer>: <time>ms`` per layer, cheapest layer first."""
+        # Ascending order leaves the most expensive layers at the end of the output.
+        for name, ms in sorted(self.layers.items(), key=operator.itemgetter(1)):
+            print(f"{name}: {ms}ms")
diff --git a/fx/trt_module.py b/fx/trt_module.py
@@ -198,15 +198,22 @@ def forward(self, *inputs):
 
             return tuple(outputs)
 
-    def enable_profiling(self):
+    def enable_profiling(self, profiler: "trt.IProfiler" = None):
         """
         Enable TensorRT profiling. After calling this function, TensorRT will report
-        time spent on each layer in stdout for each forward run.
+        time spent on each layer to the attached profiler for each forward run.
+
+        Args:
+            profiler: Profiler to attach to the execution context. If None, a
+                ``trt.Profiler()`` is attached, which reports to stdout.
+
+        If the context already has a profiler attached, it is left in place and
+        ``profiler`` is ignored.
         """
         self._check_initialized()
 
         if not self.context.profiler:
-            self.context.profiler = trt.Profiler()
+            self.context.profiler = trt.Profiler() if profiler is None else profiler
 
     def disable_profiling(self):
         """