pytorch · frank-wei · Aug 22, 2022 · Aug 22, 2022
diff --git a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb
@@ -10,14 +10,14 @@
       "bento/extensions/theme/main.css": true
     },
     "kernelspec": {
-      "display_name": "accelerators",
+      "display_name": "dper3_pytorch (cinder)",
       "language": "python",
-      "name": "bento_kernel_accelerators",
+      "name": "bento_kernel_dper3_pytorch_cinder",
       "metadata": {
-        "kernel_name": "bento_kernel_accelerators",
-        "nightly_builds": true,
+        "kernel_name": "bento_kernel_dper3_pytorch_cinder",
+        "nightly_builds": false,
         "fbpkg_supported": true,
-        "cinder_runtime": false,
+        "cinder_runtime": true,
         "is_prebuilt": true
       }
     },
@@ -32,10 +32,10 @@
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3"
     },
-    "last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00",
-    "last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca",
-    "last_base_url": "https://devgpu005.ftw6.facebook.com:8093/",
-    "last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407",
+    "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58",
+    "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202",
+    "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/",
+    "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139",
     "outputWidgetContext": {}
   },
   "nbformat": 4,
@@ -58,14 +58,14 @@
     {
       "cell_type": "code",
       "metadata": {
-        "originalKey": "7909785f-b9b4-41dd-82af-c144b879df39",
+        "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
         "showInput": true,
         "customInput": null,
         "collapsed": false,
-        "requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
+        "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424",
         "customOutput": null,
-        "executionStartTime": 1656395936225,
-        "executionStopTime": 1656395937851
+        "executionStartTime": 1661189891682,
+        "executionStopTime": 1661189891856
       },
       "source": [
         "import typing as t\n",
@@ -74,10 +74,10 @@
         "\n",
         "import torch\n",
         "import torchvision\n",
-        "from torch_tensorrt.fx.lower import lower_to_trt\n",
+        "from torch_tensorrt.fx.lower import compile\n",
         "from torch_tensorrt.fx.utils import LowerPrecision"
       ],
-      "execution_count": 4,
+      "execution_count": 9,
       "outputs": []
     },
     {
@@ -98,16 +98,16 @@
     {
       "cell_type": "code",
       "metadata": {
-        "originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd",
+        "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726",
         "showInput": true,
         "customInput": null,
         "code_folding": [],
         "hidden_ranges": [],
         "collapsed": false,
-        "requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726",
+        "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5",
         "customOutput": null,
-        "executionStartTime": 1656398717455,
-        "executionStopTime": 1656398717662
+        "executionStartTime": 1661189260550,
+        "executionStopTime": 1661189262039
       },
       "source": [
         "@dataclass\n",
@@ -159,24 +159,39 @@
         "            f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
         "        )"
       ],
-      "execution_count": 22,
-      "outputs": []
+      "execution_count": 2,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
-        "showInput": true,
+        "showInput": false,
         "customInput": null,
         "code_folding": [],
         "hidden_ranges": []
       },
       "source": [
         "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. The benchmark result for each configuration is printed at the end of the run. `benchmark_torch_function` is the function that actually runs the given function for a fixed number of iterations and estimates the average time of a single call.\n",
-        "The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n",
+        "The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n",
         "Its signature is listed below to show how it is used. It takes the original module, the inputs and the lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
         "```\n",
-        "def lower_to_trt(\n",
+        "def compile(\n",
         "    module: nn.Module,\n",
         "    input: ,\n",
         "    max_batch_size: int = 2048,\n",
@@ -212,22 +227,18 @@
     {
       "cell_type": "code",
       "metadata": {
-        "originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e",
+        "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b",
         "showInput": true,
         "customInput": null,
         "code_folding": [],
         "hidden_ranges": [],
         "collapsed": false,
-        "requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b",
+        "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0",
         "customOutput": null,
-        "executionStartTime": 1656397903207,
-        "executionStopTime": 1656397964752
+        "executionStartTime": 1661189697773,
+        "executionStopTime": 1661189753875
       },
       "source": [
-        "test_model = torchvision.models.resnet18(pretrained=True)\n",
-        "input = [torch.rand(128, 3, 224, 224)]   \n",
-        "benchmark(test_model, input, 50, 128)\n",
-        "\n",
         "def benchmark_torch_function(iters: int, f, *args) -> float:\n",
         "    \"\"\"Estimates the average time duration for a single inference call in second\n",
         "\n",
@@ -266,7 +277,7 @@
         "        time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n",
         "    elif not conf.jit:\n",
         "        # Run lowering eager mode benchmark\n",
-        "        lowered_module = lower_to_trt(\n",
+        "        lowered_module = compile(\n",
         "            module,\n",
         "            input,\n",
         "            max_batch_size=conf.batch_size,\n",
@@ -279,6 +290,7 @@
         "    result = Result(module=module, input=input, conf=conf, time_sec=time)\n",
         "    return result\n",
         "\n",
+        "\n",
         "@torch.inference_mode()\n",
         "def benchmark(\n",
         "    model,\n",
@@ -315,16 +327,25 @@
         "        ),\n",
         "    ]\n",
         "\n",
-        "    results = [\n",
-        "        run_configuration_benchmark(deepcopy(model), inputs, conf_)\n",
-        "        for conf_ in configurations\n",
-        "    ]\n",
+        "    results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n",
         "\n",
         "    for res in results:\n",
-        "        print(res.format())"
+        "        print(res.format())\n",
+        "\n",
+        "\n",
+        "test_model = torchvision.models.resnet18(pretrained=True)\n",
+        "input = [torch.rand(128, 3, 224, 224)]\n",
+        "benchmark(test_model, input, 50, 128)"
       ],
-      "execution_count": 21,
+      "execution_count": 8,
       "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n"
+          ]
+        },
         {
           "output_type": "stream",
           "name": "stdout",
@@ -339,25 +360,60 @@
             "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n"
           ]
         },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103501.297 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpe_7p37fq\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103501.390 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpg_a347f0\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
+          ]
+        },
         {
           "output_type": "stream",
           "name": "stdout",
           "text": [
-            "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpaayayg72\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
+            "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
           ]
         },
         {
           "output_type": "stream",
           "name": "stderr",
           "text": [
-            "I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n"
+            "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n"
           ]
         },
         {
           "output_type": "stream",
           "name": "stderr",
           "text": [
-            "I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n"
+            "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n"
           ]
         },
         {
@@ -374,25 +430,60 @@
             "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n"
           ]
         },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103523.067 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpgphlicna\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103523.106 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpy9cumddi\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
+          ]
+        },
         {
           "output_type": "stream",
           "name": "stdout",
           "text": [
-            "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpnoeblgd5\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
+            "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n"
           ]
         },
         {
           "output_type": "stream",
           "name": "stderr",
           "text": [
-            "I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n"
+            "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n"
           ]
         },
         {
           "output_type": "stream",
           "name": "stderr",
           "text": [
-            "I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n"
+            "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n"
           ]
         },
         {
@@ -406,7 +497,7 @@
           "output_type": "stream",
           "name": "stdout",
           "text": [
-            "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n"
+            "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n"
           ]
         }
       ]