Skip to content

update fx notebook #1297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 22, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 137 additions & 46 deletions notebooks/getting_started_with_fx_path_lower_to_trt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
"bento/extensions/theme/main.css": true
},
"kernelspec": {
"display_name": "accelerators",
"display_name": "dper3_pytorch (cinder)",
"language": "python",
"name": "bento_kernel_accelerators",
"name": "bento_kernel_dper3_pytorch_cinder",
"metadata": {
"kernel_name": "bento_kernel_accelerators",
"nightly_builds": true,
"kernel_name": "bento_kernel_dper3_pytorch_cinder",
"nightly_builds": false,
"fbpkg_supported": true,
"cinder_runtime": false,
"cinder_runtime": true,
"is_prebuilt": true
}
},
Expand All @@ -32,10 +32,10 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
},
"last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00",
"last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca",
"last_base_url": "https://devgpu005.ftw6.facebook.com:8093/",
"last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407",
"last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58",
"last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202",
"last_base_url": "https://devgpu005.ftw6.facebook.com:8091/",
"last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139",
"outputWidgetContext": {}
},
"nbformat": 4,
Expand All @@ -58,14 +58,14 @@
{
"cell_type": "code",
"metadata": {
"originalKey": "7909785f-b9b4-41dd-82af-c144b879df39",
"originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
"showInput": true,
"customInput": null,
"collapsed": false,
"requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
"requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424",
"customOutput": null,
"executionStartTime": 1656395936225,
"executionStopTime": 1656395937851
"executionStartTime": 1661189891682,
"executionStopTime": 1661189891856
},
"source": [
"import typing as t\n",
Expand All @@ -74,10 +74,10 @@
"\n",
"import torch\n",
"import torchvision\n",
"from torch_tensorrt.fx.lower import lower_to_trt\n",
"from torch_tensorrt.fx.lower import compile\n",
"from torch_tensorrt.fx.utils import LowerPrecision"
],
"execution_count": 4,
"execution_count": 9,
"outputs": []
},
{
Expand All @@ -98,16 +98,16 @@
{
"cell_type": "code",
"metadata": {
"originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd",
"originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726",
"showInput": true,
"customInput": null,
"code_folding": [],
"hidden_ranges": [],
"collapsed": false,
"requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726",
"requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5",
"customOutput": null,
"executionStartTime": 1656398717455,
"executionStopTime": 1656398717662
"executionStartTime": 1661189260550,
"executionStopTime": 1661189262039
},
"source": [
"@dataclass\n",
Expand Down Expand Up @@ -159,24 +159,39 @@
" f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
" )"
],
"execution_count": 22,
"outputs": []
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
"showInput": true,
"showInput": false,
"customInput": null,
"code_folding": [],
"hidden_ranges": []
},
"source": [
"Run FX path lowering and benchmark the given model according to the specified benchmark configuration. The benchmark result for each configuration is printed at the end of the run. `benchmark_torch_function` is the helper that runs the given function for a fixed number of iterations and estimates the average time of a single call.\n",
"The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n",
"FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n",
"Its signature is listed below to show how it is used. It takes the original module, the inputs, and the lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
"```\n",
"def lower_to_trt(\n",
"def compile(\n",
" module: nn.Module,\n",
"    input,\n",
" max_batch_size: int = 2048,\n",
Expand Down Expand Up @@ -212,22 +227,18 @@
{
"cell_type": "code",
"metadata": {
"originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e",
"originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b",
"showInput": true,
"customInput": null,
"code_folding": [],
"hidden_ranges": [],
"collapsed": false,
"requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b",
"requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0",
"customOutput": null,
"executionStartTime": 1656397903207,
"executionStopTime": 1656397964752
"executionStartTime": 1661189697773,
"executionStopTime": 1661189753875
},
"source": [
"test_model = torchvision.models.resnet18(pretrained=True)\n",
"input = [torch.rand(128, 3, 224, 224)] \n",
"benchmark(test_model, input, 50, 128)\n",
"\n",
"def benchmark_torch_function(iters: int, f, *args) -> float:\n",
" \"\"\"Estimates the average time duration for a single inference call in second\n",
"\n",
Expand Down Expand Up @@ -266,7 +277,7 @@
" time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n",
" elif not conf.jit:\n",
" # Run lowering eager mode benchmark\n",
" lowered_module = lower_to_trt(\n",
" lowered_module = compile(\n",
" module,\n",
" input,\n",
" max_batch_size=conf.batch_size,\n",
Expand All @@ -279,6 +290,7 @@
" result = Result(module=module, input=input, conf=conf, time_sec=time)\n",
" return result\n",
"\n",
"\n",
"@torch.inference_mode()\n",
"def benchmark(\n",
" model,\n",
Expand Down Expand Up @@ -315,16 +327,25 @@
" ),\n",
" ]\n",
"\n",
" results = [\n",
" run_configuration_benchmark(deepcopy(model), inputs, conf_)\n",
" for conf_ in configurations\n",
" ]\n",
" results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n",
"\n",
" for res in results:\n",
" print(res.format())"
" print(res.format())\n",
"\n",
"\n",
"test_model = torchvision.models.resnet18(pretrained=True)\n",
"input = [torch.rand(128, 3, 224, 224)]\n",
"benchmark(test_model, input, 50, 128)"
],
"execution_count": 21,
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n"
]
},
{
"output_type": "stream",
"name": "stdout",
Expand All @@ -339,25 +360,60 @@
"== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103501.297 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpe_7p37fq\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103501.390 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpg_a347f0\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpaayayg72\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
"\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n"
"I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n"
"I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n"
]
},
{
Expand All @@ -374,25 +430,60 @@
"== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103523.067 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpgphlicna\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103523.106 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpy9cumddi\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpnoeblgd5\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
"\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n"
"I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n"
"I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n"
]
},
{
Expand All @@ -406,7 +497,7 @@
"output_type": "stream",
"name": "stdout",
"text": [
"== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n"
"== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n"
]
}
]
Expand Down