Added kwarg to refit

cehongwang · cehongwang · commit dc50ee017408 · 2024-07-12T14:53:36.000-07:00
diff --git a/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py b/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py
@@ -0,0 +1,98 @@
+"""
+.. _refit_engine_example:
+
+Refit TensorRT Graph Module with Torch-TensorRT
+===================================================================
+
+We are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights.
+
+In many cases, model weights are updated frequently, for example when applying various LoRA adapters to Stable Diffusion or when running constant A/B tests of AI products.
+That poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient.
+Torch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow.
+
+In this tutorial, we are going to walk through:
+1. Compiling a PyTorch model to a TensorRT Graph Module
+2. Saving and loading a graph module
+3. Refitting the graph module
+"""
+
+# %%
+# Standard Workflow
+# -----------------------------
+
+# %%
+# Imports and model definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+import numpy as np
+import torch
+import torch_tensorrt as torch_trt
+import torchvision.models as models
+from torch_tensorrt.dynamo import refit_module_weights
+
+np.random.seed(0)
+torch.manual_seed(0)
+inputs = [torch.rand((1, 3, 224, 224)).to("cuda")]
+
+
+# %%
+# Compile the module for the first time and save it.
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+model = models.resnet18(pretrained=False).eval().to("cuda")
+exp_program = torch.export.export(model, tuple(inputs))
+enabled_precisions = {torch.float}
+debug = False
+workspace_size = 20 << 30
+min_block_size = 0
+use_python_runtime = False
+torch_executed_ops = {}
+trt_gm = torch_trt.dynamo.compile(
+    exp_program,
+    tuple(inputs),
+    use_python_runtime=use_python_runtime,
+    enabled_precisions=enabled_precisions,
+    debug=debug,
+    min_block_size=min_block_size,
+    torch_executed_ops=torch_executed_ops,
+    make_refitable=True,
+)  # Output is a torch.fx.GraphModule
+
+# Save the graph module as an exported program
+# This is only supported when use_python_runtime = False
+torch_trt.save(trt_gm, "./compiled.ep", inputs=inputs)
+
+
+# %%
+# Refit the module with updated model weights
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Create and export the updated model
+model2 = models.resnet18(pretrained=True).eval().to("cuda")
+exp_program2 = torch.export.export(model2, tuple(inputs))
+
+
+compiled_trt_ep = torch_trt.load("./compiled.ep")
+
+# This returns a new module with updated weights
+new_trt_gm = refit_module_weights(
+    compiled_module=compiled_trt_ep,
+    new_weight_module=exp_program2,
+    arg_inputs=inputs,
+)
+
+# Check the output
+expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
+for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
+    assert torch.allclose(
+        expected_output, refitted_output, 1e-2, 1e-2
+    ), "Refit Result is not correct. Refit failed"
+
+print("Refit successfully!")
+
+# %%
+# Alternative Workflow using Python Runtime
+# -----------------------------------------
+
+# Currently, the Python runtime does not support engine serialization, so the refitting will be done in the same runtime.
+# This use case is more useful when you need to switch between different weights in the same runtime, such as when using Stable Diffusion.
diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py
@@ -78,7 +78,7 @@
 new_trt_gm = refit_module_weights(
     compiled_module=compiled_trt_ep,
     new_weight_module=exp_program2,
-    inputs=inputs,
+    arg_inputs=inputs,
 )
 
 # Check the output
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -3,7 +3,7 @@
 import collections.abc
 import copy
 import logging
-from typing import Any, Sequence, Tuple
+from typing import Any, Optional, Sequence, Tuple
 
 import numpy as np
 import tensorrt as trt
@@ -36,7 +36,6 @@
 from torch_tensorrt.dynamo.utils import (
     check_output,
     get_torch_inputs,
-    prepare_inputs,
     set_log_level,
     to_torch_device,
     to_torch_tensorrt_device,
@@ -146,7 +145,8 @@ def _refit_single_trt_engine_with_gm(
 def refit_module_weights(
     compiled_module: torch.fx.GraphModule | ExportedProgram,
     new_weight_module: ExportedProgram,
-    inputs: Tuple[Any, ...],
+    arg_inputs: Optional[Tuple[Any, ...]] = None,
+    kwarg_inputs: Optional[dict[str, Any]] = None,
     verify_output: bool = False,
 ) -> torch.fx.GraphModule:
     """
@@ -208,27 +208,29 @@ def refit_module_weights(
     if settings.debug:
         set_log_level(logger.parent, logging.DEBUG)
 
-    if not isinstance(inputs, collections.abc.Sequence):
-        inputs = [inputs]
-
-    # Prepare torch_trt inputs
-    inputs = prepare_inputs(inputs)
     device = to_torch_tensorrt_device(settings.device)
-    torch_inputs = get_torch_inputs(inputs, device)
+    if arg_inputs:
+        if not isinstance(arg_inputs, collections.abc.Sequence):
+            # Prepare torch_trt inputs
+            arg_inputs = [arg_inputs]
+        torch_inputs = get_torch_inputs(arg_inputs, device)
+
+    if kwarg_inputs:
+        torch_kwarg_inputs = get_torch_inputs(kwarg_inputs, device)
     runtime = trt.Runtime(TRT_LOGGER)
     if not isinstance(new_weight_module, ExportedProgram):
         raise AssertionError(
             f"Input graph should be an ExportedProgram but got type {type(new_weight_module)}"
         )
-    new_weight_module = pre_export_lowering(new_weight_module, torch_inputs)
+    new_weight_module = pre_export_lowering(new_weight_module)
     new_weight_module = new_weight_module.run_decompositions(
         get_decompositions(settings.enable_experimental_decompositions)
     )
     new_gm = new_weight_module.module()
     logger.debug("Input graph: " + str(new_gm.graph))
     # Apply lowering on the graph module
 
-    new_gm = post_lowering(new_gm, torch_inputs)
+    new_gm = post_lowering(new_gm)
 
     logger.info("Compilation Settings: %s\n", settings)
 
@@ -354,11 +356,12 @@ def refit_module_weights(
             refitted_engine = torch.classes.tensorrt.Engine(tuple(new_engine_info))
             setattr(compiled_module, f"{name}_engine", refitted_engine)
 
-    if verify_output:
+    if verify_output and arg_inputs is not None:
         if check_output(
             new_module=new_gm,
             refitted_module=compiled_module,
-            inputs=torch_inputs,
+            arg_inputs=torch_inputs,
+            kwarg_inputs=torch_kwarg_inputs,
         ):
             logger.info("Refitting Succeed!")
         else:
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
@@ -397,9 +397,12 @@ def function_wrapper(*args: Any, **kwargs: Any) -> Any:
 def check_output(
     new_module: torch.fx.GraphModule,
     refitted_module: torch.fx.GraphModule,
-    inputs: tuple[Any, ...],
+    arg_inputs: Any,
+    kwarg_inputs: Any = None,
 ) -> bool:
-    old_outputs, new_outputs = refitted_module(*inputs), new_module(*inputs)
+    old_outputs, new_outputs = refitted_module(*arg_inputs), new_module(
+        *arg_inputs, **kwarg_inputs
+    )
     for old_output, new_output in zip(old_outputs, new_outputs):
         if isinstance(old_output, torch.Tensor) and isinstance(
             new_outputs, torch.Tensor
diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py
@@ -36,7 +36,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             truncate_double=True,
             min_block_size=1,
@@ -74,7 +74,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             truncate_double=True,
             min_block_size=1,
@@ -118,7 +118,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             truncate_double=False,
             min_block_size=1,
@@ -157,7 +157,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             truncate_double=False,
             min_block_size=1,
@@ -201,7 +201,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             enabled_precisions={torch.float, torch.bfloat16, torch.half},
             min_block_size=1,
@@ -239,7 +239,7 @@ def forward(self, x):
         exp_mod = torch.export.export(mod, (in_tensor,))
         trt_mod = torch_tensorrt.dynamo.compile(
             exp_mod,
-            arg_inputs=[in_tensor],
+            inputs=[in_tensor],
             pass_through_build_failures=True,
             enabled_precisions={torch.float, torch.bfloat16, torch.half},
             min_block_size=1,
diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py
@@ -108,7 +108,7 @@ def test_refit_one_engine():
     new_trt_gm = refit_module_weights(
         compiled_module=trt_gm,
         new_weight_module=exp_program2,
-        inputs=inputs,
+        arg_inputs=inputs,
     )
 
     # Check the output
@@ -154,7 +154,7 @@ def test_refit_one_engine_bert():
     new_trt_gm = refit_module_weights(
         compiled_module=trt_gm,
         new_weight_module=exp_program2,
-        inputs=inputs,
+        arg_inputs=inputs,
     )
 
     # Check the output
@@ -203,7 +203,7 @@ def test_refit_one_engine_inline_runtime():
     new_trt_gm = refit_module_weights(
         compiled_module=trt_gm,
         new_weight_module=exp_program2,
-        inputs=inputs,
+        arg_inputs=inputs,
     )
 
     # Check the output
@@ -247,7 +247,7 @@ def test_refit_one_engine_python_runtime():
     new_trt_gm = refit_module_weights(
         compiled_module=trt_gm,
         new_weight_module=exp_program2,
-        inputs=inputs,
+        arg_inputs=inputs,
     )
 
     # Check the output
@@ -313,7 +313,7 @@ def forward(self, x):
     new_trt_gm = refit_module_weights(
         compiled_module=trt_gm,
         new_weight_module=exp_program2,
-        inputs=inputs,
+        arg_inputs=inputs,
     )
 
     # Check the output
diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py
@@ -222,7 +222,7 @@ def calibrate_loop(model):
             exp_program = torch.export.export(model, (input_tensor,))
             trt_model = torchtrt.dynamo.compile(
                 exp_program,
-                arg_inputs=[input_tensor],
+                inputs=[input_tensor],
                 enabled_precisions={torch.float8_e4m3fn},
                 min_block_size=1,
                 debug=True,

Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@`
`78`	`78`	`new_trt_gm = refit_module_weights(`
`79`	`79`	`compiled_module=compiled_trt_ep,`
`80`	`80`	`new_weight_module=exp_program2,`
`81`		`- inputs=inputs,`
	`81`	`+ arg_inputs=inputs,`
`82`	`82`	`)`
`83`	`83`
`84`	`84`	`# Check the output`