Skip to content

Commit 6533d5c

Browse files
committed
force using slow refit, add unit tests
1 parent 034c2ba commit 6533d5c

File tree

3 files changed

+185
-18
lines changed

3 files changed

+185
-18
lines changed

examples/dynamo/engine_caching_example.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
np.random.seed(0)
1212
torch.manual_seed(0)
13-
size = (100, 3, 224, 224)
1413

1514
model = models.resnet18(pretrained=True).eval().to("cuda")
1615
enabled_precisions = {torch.float}
@@ -24,7 +23,7 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
2423
os.remove(path)
2524

2625

27-
def dynamo_path(iterations=3):
26+
def dynamo_compile(iterations=3):
2827
times = []
2928
start = torch.cuda.Event(enable_timing=True)
3029
end = torch.cuda.Event(enable_timing=True)
@@ -42,7 +41,7 @@ def dynamo_path(iterations=3):
4241
# The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
4342
for i in range(iterations):
4443
inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
45-
remove_timing_cache() # remove timing cache for engine caching messurement
44+
remove_timing_cache()  # remove timing cache just for engine caching measurement
4645
if i == 0:
4746
cache_built_engines = False
4847
reuse_cached_engines = False
@@ -63,11 +62,15 @@ def dynamo_path(iterations=3):
6362
reuse_cached_engines=reuse_cached_engines,
6463
engine_cache_size=1 << 30, # 1GB
6564
)
65+
# output = trt_gm(*inputs)
6666
end.record()
6767
torch.cuda.synchronize()
6868
times.append(start.elapsed_time(end))
6969

70-
print("-----dynamo_path-----> compilation time:\n", times, "milliseconds")
70+
print("----------------dynamo_compile----------------")
71+
print("disable engine caching, used:", times[0], "ms")
72+
print("enable engine caching to cache engines, used:", times[1], "ms")
73+
print("enable engine caching to reuse engines, used:", times[2], "ms")
7174

7275

7376
# Custom Engine Cache
@@ -84,11 +87,13 @@ def save(
8487
blob: bytes,
8588
prefix: str = "blob",
8689
):
90+
if not os.path.exists(self.engine_cache_dir):
91+
os.makedirs(self.engine_cache_dir, exist_ok=True)
92+
8793
path = os.path.join(
8894
self.engine_cache_dir,
8995
f"{prefix}_{hash}.bin",
9096
)
91-
os.makedirs(path, exist_ok=True)
9297
with open(path, "wb") as f:
9398
f.write(blob)
9499

@@ -101,7 +106,7 @@ def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]:
101106
return None
102107

103108

104-
def compile_path(iterations=3):
109+
def torch_compile(iterations=3):
105110
times = []
106111
engine_cache = MyEngineCache("/tmp/your_dir")
107112
start = torch.cuda.Event(enable_timing=True)
@@ -112,8 +117,8 @@ def compile_path(iterations=3):
112117
# Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
113118
# The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
114119
for i in range(iterations):
115-
inputs = [torch.rand(size).to("cuda")]
116-
# remove timing cache and reset dynamo for engine caching messurement
120+
inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
121+
# remove timing cache and reset dynamo just for engine caching measurement
117122
remove_timing_cache()
118123
torch._dynamo.reset()
119124

@@ -129,7 +134,7 @@ def compile_path(iterations=3):
129134
model,
130135
backend="tensorrt",
131136
options={
132-
"use_python_runtime": use_python_runtime,
137+
"use_python_runtime": True,
133138
"enabled_precisions": enabled_precisions,
134139
"debug": debug,
135140
"min_block_size": min_block_size,
@@ -144,9 +149,12 @@ def compile_path(iterations=3):
144149
torch.cuda.synchronize()
145150
times.append(start.elapsed_time(end))
146151

147-
print("-----compile_path-----> compilation time:\n", times, "milliseconds")
152+
print("----------------torch_compile----------------")
153+
print("disable engine caching, used:", times[0], "ms")
154+
print("enable engine caching to cache engines, used:", times[1], "ms")
155+
print("enable engine caching to reuse engines, used:", times[2], "ms")
148156

149157

150158
if __name__ == "__main__":
151-
dynamo_path()
152-
# compile_path()
159+
dynamo_compile()
160+
torch_compile()

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -502,25 +502,31 @@ def run(
502502
"Found the cached engine that corresponds to this graph. It is directly loaded."
503503
)
504504

505+
runtime = trt.Runtime(TRT_LOGGER)
506+
engine = runtime.deserialize_cuda_engine(serialized_engine)
507+
505508
from torch_tensorrt.dynamo._refit import (
506509
_refit_single_trt_engine_with_gm,
507510
)
508511

509-
runtime = trt.Runtime(TRT_LOGGER)
510-
engine = runtime.deserialize_cuda_engine(serialized_engine)
511-
512+
# TODO: Fast refit is problematic for now. It will fail if the engine has batch_norm layers.
513+
# We set weight_name_map=None to use slow refit anyway for now. Will fix it in the future.
512514
_refit_single_trt_engine_with_gm(
513515
new_gm=self.module,
514516
old_engine=engine,
515517
input_list=self.input_specs,
516518
settings=self.compilation_settings,
517-
weight_name_map=weight_name_map,
519+
weight_name_map=None,
518520
)
519521

520-
serialized_engine = bytes(engine.serialize())
522+
serialized_engine = engine.serialize()
523+
524+
with io.BytesIO() as engine_bytes:
525+
engine_bytes.write(serialized_engine)
526+
engine_str = engine_bytes.getvalue()
521527

522528
return TRTInterpreterResult(
523-
serialized_engine,
529+
engine_str,
524530
self._input_names,
525531
self._output_names,
526532
self.weight_name_map,
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# type: ignore
2+
import os
3+
import shutil
4+
import unittest
5+
from typing import Optional
6+
7+
import torch
8+
import torch_tensorrt as torch_trt
9+
import torchvision.models as models
10+
from torch.testing._internal.common_utils import TestCase
11+
from torch_tensorrt.dynamo._defaults import ENGINE_CACHE_DIR
12+
from torch_tensorrt.dynamo._engine_caching import BaseEngineCache
13+
from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
14+
15+
assertions = unittest.TestCase()
16+
17+
18+
class MyEngineCache(BaseEngineCache):
    """Minimal filesystem-backed engine cache used by the tests.

    Each serialized engine blob is stored as ``<prefix>_<hash>.bin``
    inside ``engine_cache_dir``.
    """

    def __init__(
        self,
        engine_cache_dir: str,
    ) -> None:
        # The directory is created lazily on the first save().
        self.engine_cache_dir = engine_cache_dir

    def save(
        self,
        hash: str,
        blob: bytes,
        prefix: str = "blob",
    ):
        """Write ``blob`` to ``<engine_cache_dir>/<prefix>_<hash>.bin``.

        Args:
            hash: Content hash identifying the engine.
            blob: Serialized engine bytes.
            prefix: Filename prefix for the cached blob.
        """
        # exist_ok=True already tolerates a pre-existing directory,
        # so a separate os.path.exists() check is redundant.
        os.makedirs(self.engine_cache_dir, exist_ok=True)

        path = os.path.join(
            self.engine_cache_dir,
            f"{prefix}_{hash}.bin",
        )
        with open(path, "wb") as f:
            f.write(blob)

    def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]:
        """Return the cached blob for ``hash``, or None if absent."""
        path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin")
        if os.path.exists(path):
            with open(path, "rb") as f:
                blob = f.read()
            return blob
        return None
48+
49+
50+
class TestEngineCache(TestCase):
    """End-to-end tests for TRT engine caching via dynamo compile and torch.compile.

    NOTE(review): these tests require a CUDA device, torchvision weights,
    and a working torch_tensorrt install.
    """

    def test_dynamo_compile(self):
        """Compile three times and check outputs agree across cache states."""
        model = models.resnet18(pretrained=True).eval().to("cuda")
        example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),)
        # Mark the dim0 of inputs as dynamic; "x" is resnet18.forward's arg name.
        batch = torch.export.Dim("batch", min=1, max=200)
        exp_program = torch.export.export(
            model, args=example_inputs, dynamic_shapes={"x": {0: batch}}
        )
        engine_cache_dir = ENGINE_CACHE_DIR
        # Start from an empty cache so iteration semantics below hold.
        if os.path.exists(engine_cache_dir):
            shutil.rmtree(engine_cache_dir)
        # The 1st iteration measures compilation without engine caching.
        # The 2nd iteration compiles and saves the engine (slower than the 1st).
        # The 3rd iteration should be faster than the 1st because it loads the cached engine.
        inputs = [torch.rand((128, 3, 224, 224)).to("cuda")]
        results = []
        for i in range(3):
            if i == 0:
                cache_built_engines = False
                reuse_cached_engines = False
            else:
                cache_built_engines = True
                reuse_cached_engines = True

            trt_gm = torch_trt.dynamo.compile(
                exp_program,
                tuple(inputs),
                use_python_runtime=False,
                enabled_precisions={torch.float},
                debug=False,
                min_block_size=1,
                make_refitable=True,
                cache_built_engines=cache_built_engines,
                reuse_cached_engines=reuse_cached_engines,
                engine_cache_size=1 << 30,  # 1GB
            )
            results.append(trt_gm(*inputs))

        cos_sim = cosine_similarity(results[0], results[1])
        assertions.assertTrue(
            cos_sim > COSINE_THRESHOLD,
            msg=f"test_dynamo_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
        )

        cos_sim = cosine_similarity(results[1], results[2])
        assertions.assertTrue(
            cos_sim > COSINE_THRESHOLD,
            msg=f"test_dynamo_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
        )

    def test_torch_compile(self):
        """Same three-iteration check through torch.compile with a custom engine cache."""
        # Custom Engine Cache
        model = models.resnet18(pretrained=True).eval().to("cuda")

        engine_cache_dir = "/tmp/your_dir"
        if os.path.exists(engine_cache_dir):
            shutil.rmtree(engine_cache_dir)

        engine_cache = MyEngineCache(engine_cache_dir)
        # The 1st iteration measures compilation without engine caching.
        # The 2nd iteration compiles and saves the engine (slower than the 1st).
        # The 3rd iteration should be faster than the 1st because it loads the cached engine.
        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
        results = []
        for i in range(3):
            # Reset dynamo so every iteration recompiles and actually exercises
            # the engine cache (engine caching measurement); without this, later
            # iterations could reuse the previous dynamo compilation.
            torch._dynamo.reset()
            if i == 0:
                cache_built_engines = False
                reuse_cached_engines = False
            else:
                cache_built_engines = True
                reuse_cached_engines = True

            compiled_model = torch.compile(
                model,
                backend="tensorrt",
                options={
                    "use_python_runtime": True,
                    "enabled_precisions": {torch.float},
                    "debug": False,
                    "min_block_size": 1,
                    "make_refitable": True,
                    "cache_built_engines": cache_built_engines,
                    "reuse_cached_engines": reuse_cached_engines,
                    "custom_engine_cache": engine_cache,  # use custom engine cache
                },
            )
            results.append(compiled_model(*inputs))  # trigger the compilation

        cos_sim = cosine_similarity(results[0], results[1])
        assertions.assertTrue(
            cos_sim > COSINE_THRESHOLD,
            msg=f"test_torch_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
        )

        cos_sim = cosine_similarity(results[1], results[2])
        assertions.assertTrue(
            cos_sim > COSINE_THRESHOLD,
            msg=f"test_torch_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
        )
)

0 commit comments

Comments
 (0)