Skip to content

Commit 65e0171

Browse files
committed
add save_engine_cache and load_engine_cache args
1 parent cc4ee82 commit 65e0171

File tree

5 files changed

+76
-25
lines changed

5 files changed

+76
-25
lines changed

examples/dynamo/engine_caching_example.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,12 @@
88
np.random.seed(0)
99
torch.manual_seed(0)
1010
size = (100, 3, 224, 224)
11-
inputs = [torch.rand(size).to("cuda")]
1211

1312
model = models.resnet18(pretrained=True).eval().to("cuda")
14-
exp_program = torch.export.export(model, tuple(inputs))
1513
enabled_precisions = {torch.float}
1614
debug = False
17-
workspace_size = 20 << 30
18-
min_block_size = 0
15+
min_block_size = 1
1916
use_python_runtime = False
20-
torch_executed_ops = {}
2117
TIMING_CACHE_PATH = "/tmp/timing_cache.bin"
2218

2319

@@ -27,17 +23,20 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
2723

2824

2925
def dynamo_path(iterations=3):
30-
outputs = []
3126
times = []
3227
start = torch.cuda.Event(enable_timing=True)
3328
end = torch.cuda.Event(enable_timing=True)
29+
inputs = [torch.rand(size).to("cuda")]
30+
exp_program = torch.export.export(model, tuple(inputs))
3431
for i in range(iterations):
3532
inputs = [torch.rand(size).to("cuda")]
36-
remove_timing_cache()
37-
if i == 0: # warmup
38-
ignore_engine_cache = True
33+
remove_timing_cache()  # remove timing cache for engine caching measurement
34+
if i == 0:
35+
save_engine_cache = False
36+
load_engine_cache = False
3937
else:
40-
ignore_engine_cache = False
38+
save_engine_cache = True
39+
load_engine_cache = True
4140

4241
start.record()
4342
trt_gm = torch_trt.dynamo.compile(
@@ -47,18 +46,57 @@ def dynamo_path(iterations=3):
4746
enabled_precisions=enabled_precisions,
4847
debug=debug,
4948
min_block_size=min_block_size,
50-
torch_executed_ops=torch_executed_ops,
5149
make_refitable=True,
52-
ignore_engine_cache=ignore_engine_cache,
50+
save_engine_cache=save_engine_cache,
51+
load_engine_cache=load_engine_cache,
52+
)
53+
end.record()
54+
torch.cuda.synchronize()
55+
times.append(start.elapsed_time(end))
56+
57+
print("-----dynamo_path-----> compilation time:", times, "milliseconds")
58+
59+
60+
def compile_path(iterations=3):
61+
times = []
62+
start = torch.cuda.Event(enable_timing=True)
63+
end = torch.cuda.Event(enable_timing=True)
64+
65+
for i in range(iterations):
66+
inputs = [torch.rand(size).to("cuda")]
67+
# remove timing cache and reset dynamo for engine caching measurement
68+
remove_timing_cache()
69+
torch._dynamo.reset()
70+
71+
if i == 0:
72+
save_engine_cache = False
73+
load_engine_cache = False
74+
else:
75+
save_engine_cache = True
76+
load_engine_cache = True
77+
78+
start.record()
79+
compiled_model = torch.compile(
80+
model,
81+
backend="tensorrt",
82+
options={
83+
"use_python_runtime": use_python_runtime,
84+
"enabled_precisions": enabled_precisions,
85+
"debug": debug,
86+
"min_block_size": min_block_size,
87+
"make_refitable": True,
88+
"save_engine_cache": save_engine_cache,
89+
"load_engine_cache": load_engine_cache,
90+
},
5391
)
92+
compiled_model(*inputs) # trigger the compilation
5493
end.record()
5594
torch.cuda.synchronize()
5695
times.append(start.elapsed_time(end))
57-
outputs.append(trt_gm(*inputs))
5896

59-
print("-----dynamo_path-----> output:", outputs)
60-
print("-----dynamo_path-----> compilation time:", times, "seconds")
97+
print("-----compile_path-----> compilation time:", times, "milliseconds")
6198

6299

63100
if __name__ == "__main__":
64101
dynamo_path()
102+
compile_path()

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def compile(
7979
dryrun: bool = _defaults.DRYRUN,
8080
hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
8181
timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
82-
ignore_engine_cache: bool = _defaults.IGNORE_ENGINE_CACHE,
82+
save_engine_cache: bool = _defaults.SAVE_ENGINE_CACHE,
83+
load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE,
8384
engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR,
8485
engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE,
8586
**kwargs: Any,
@@ -142,7 +143,8 @@ def compile(
142143
dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs
143144
hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
144145
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
145-
ignore_engine_cache (bool): Whether to ignore the cached TRT engines and recompile the module
146+
save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk
147+
load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
146148
engine_cache_dir (str): Directory to store the cached TRT engines
147149
engine_cache_size (int): Maximum hard-disk space to use for the engine cache
148150
**kwargs: Any,
@@ -240,7 +242,8 @@ def compile(
240242
"dryrun": dryrun,
241243
"hardware_compatible": hardware_compatible,
242244
"timing_cache_path": timing_cache_path,
243-
"ignore_engine_cache": ignore_engine_cache,
245+
"save_engine_cache": save_engine_cache,
246+
"load_engine_cache": load_engine_cache,
244247
"engine_cache_dir": engine_cache_dir,
245248
"engine_cache_size": engine_cache_size,
246249
}

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
HARDWARE_COMPATIBLE = False
3333
SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8}
3434
TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin")
35-
IGNORE_ENGINE_CACHE = False
35+
SAVE_ENGINE_CACHE = True
36+
LOAD_ENGINE_CACHE = True
3637
ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
3738
ENGINE_CACHE_SIZE = 1 << 30
3839

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,15 @@
1818
ENGINE_CACHE_SIZE,
1919
ENGINE_CAPABILITY,
2020
HARDWARE_COMPATIBLE,
21-
IGNORE_ENGINE_CACHE,
21+
LOAD_ENGINE_CACHE,
2222
MAKE_REFITABLE,
2323
MAX_AUX_STREAMS,
2424
MIN_BLOCK_SIZE,
2525
NUM_AVG_TIMING_ITERS,
2626
OPTIMIZATION_LEVEL,
2727
PASS_THROUGH_BUILD_FAILURES,
2828
REQUIRE_FULL_COMPILATION,
29+
SAVE_ENGINE_CACHE,
2930
SPARSE_WEIGHTS,
3031
TIMING_CACHE_PATH,
3132
TRUNCATE_DOUBLE,
@@ -76,7 +77,8 @@ class CompilationSettings:
7677
output to a file if a string path is specified
7778
hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
7879
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
79-
ignore_engine_cache (bool): Whether to ignore the cached TRT engines and recompile the module
80+
save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk
81+
load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
8082
engine_cache_dir (str): Directory to store the cached TRT engines
8183
engine_cache_size (int): Maximum hard-disk space to use for the engine cache
8284
"""
@@ -110,6 +112,7 @@ class CompilationSettings:
110112
dryrun: Union[bool, str] = DRYRUN
111113
hardware_compatible: bool = HARDWARE_COMPATIBLE
112114
timing_cache_path: str = TIMING_CACHE_PATH
113-
ignore_engine_cache: bool = IGNORE_ENGINE_CACHE
115+
save_engine_cache: bool = SAVE_ENGINE_CACHE
116+
load_engine_cache: bool = LOAD_ENGINE_CACHE
114117
engine_cache_dir: str = ENGINE_CACHE_DIR
115118
engine_cache_size: int = ENGINE_CACHE_SIZE

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,16 +331,22 @@ def run(
331331
Args:
332332
strict_type_constraints: Usually we should set it to False unless we want to control the precision of certain layer for numeric reasons.
333333
algorithm_selector: set up algorithm selection for certain layer
334+
tactic_sources: set up tactic sources for certain layer
334335
Return:
335336
TRTInterpreterResult
336337
"""
337-
if not self.compilation_settings.ignore_engine_cache:
338-
# query the cached TRT engine
338+
if (
339+
self.compilation_settings.save_engine_cache
340+
or self.compilation_settings.load_engine_cache
341+
):
339342
engine_cache = EngineCache(
340343
self.compilation_settings.engine_cache_size,
341344
self.compilation_settings.engine_cache_dir,
342345
)
343346
hash_val = EngineCache.get_hash(self.module)
347+
348+
if self.compilation_settings.load_engine_cache:
349+
# query the cached TRT engine
344350
serialized_engine, input_names, output_names = engine_cache.load(hash_val)
345351
if serialized_engine is not None:
346352
self._input_names = input_names
@@ -390,7 +396,7 @@ def run(
390396
self._save_timing_cache(
391397
builder_config, self.compilation_settings.timing_cache_path
392398
)
393-
if not self.compilation_settings.ignore_engine_cache:
399+
if self.compilation_settings.save_engine_cache:
394400
engine_cache.save(
395401
hash_val, serialized_engine, self._input_names, self._output_names
396402
)

0 commit comments

Comments
 (0)