Skip to content

Commit 37d3311

Browse files
committed
support customizing engine cache class
1 parent 65e0171 commit 37d3311

File tree

5 files changed

+49
-18
lines changed

5 files changed

+49
-18
lines changed

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import collections.abc
44
import logging
55
import warnings
6-
from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union
6+
from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Type, Union
77

88
import torch
99
from torch.export import ExportedProgram
@@ -18,6 +18,7 @@
1818
dryrun_stats_display,
1919
parse_non_trt_nodes,
2020
)
21+
from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, EngineCache
2122
from torch_tensorrt.dynamo.conversion import (
2223
CompilationSettings,
2324
UnsupportedOperatorException,
@@ -83,6 +84,7 @@ def compile(
8384
load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE,
8485
engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR,
8586
engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE,
87+
engine_cache_class: Type[BaseEngineCache] = EngineCache,
8688
**kwargs: Any,
8789
) -> torch.fx.GraphModule:
8890
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -147,6 +149,7 @@ def compile(
147149
load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
148150
engine_cache_dir (str): Directory to store the cached TRT engines
149151
engine_cache_size (int): Maximum hard-disk space to use for the engine cache
152+
engine_cache_class (Type[BaseEngineCache]): Engine cache class to use for saving and loading engines. Users can provide their own engine cache class by inheriting from BaseEngineCache
150153
**kwargs: Any,
151154
Returns:
152155
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -246,6 +249,7 @@ def compile(
246249
"load_engine_cache": load_engine_cache,
247250
"engine_cache_dir": engine_cache_dir,
248251
"engine_cache_size": engine_cache_size,
252+
"engine_cache_class": engine_cache_class,
249253
}
250254

251255
settings = CompilationSettings(**compilation_options)

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import torch
55
from torch_tensorrt._Device import Device
66
from torch_tensorrt._enums import EngineCapability, dtype
7+
from torch_tensorrt.dynamo._engine_caching import EngineCache
78

89
ENABLED_PRECISIONS = {dtype.f32}
910
DEBUG = False
@@ -36,6 +37,7 @@
3637
LOAD_ENGINE_CACHE = True
3738
ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
3839
ENGINE_CACHE_SIZE = 1 << 30
40+
ENGINE_CACHE_CLASS = EngineCache
3941

4042

4143
def default_device() -> Device:

py/torch_tensorrt/dynamo/_engine_caching.py

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,25 @@
33
import logging
44
import os
55
from abc import ABC, abstractmethod
6-
from typing import List, Optional, Tuple, cast
6+
from typing import Any, List, Optional, Tuple, cast
77

88
import torch
99
from torch._inductor.codecache import FxGraphCachePickler
1010
from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode
11-
from torch_tensorrt.dynamo._defaults import ENGINE_CACHE_DIR, ENGINE_CACHE_SIZE
1211

1312
_LOGGER: logging.Logger = logging.getLogger(__name__)
1413

1514

1615
class BaseEngineCache(ABC):
1716

17+
@abstractmethod
18+
def __init__(
19+
self,
20+
*args: Any,
21+
**kwargs: Any,
22+
) -> None:
23+
pass
24+
1825
@staticmethod
1926
def get_hash(gm: torch.fx.GraphModule) -> str:
2027
"""Get the hash value of the GraphModule
@@ -42,7 +49,7 @@ def save(
4249
serialized_engine: bytes,
4350
input_names: List[str],
4451
output_names: List[str],
45-
) -> None:
52+
) -> bool:
4653
"""Save the serialized engine to hard disk
4754
4855
Args:
@@ -52,7 +59,7 @@ def save(
5259
output_names (List[str]): output names of TRT engine
5360
5461
Returns:
55-
None
62+
bool: whether the serialized engine is saved successfully
5663
"""
5764
pass
5865

@@ -82,8 +89,8 @@ class EngineCache(BaseEngineCache):
8289

8390
def __init__(
8491
self,
85-
engine_cache_size: int = ENGINE_CACHE_SIZE,
86-
engine_cache_dir: str = ENGINE_CACHE_DIR,
92+
engine_cache_size: int,
93+
engine_cache_dir: str,
8794
) -> None:
8895
self.total_engine_cache_size = engine_cache_size
8996
self.available_engine_cache_size = engine_cache_size
@@ -98,7 +105,7 @@ def has_available_cache_size(self, serialized_engine: bytes) -> bool:
98105
Returns:
99106
bool: whether the cache has available size for the serialized engine
100107
"""
101-
return serialized_engine.nbytes <= self.available_engine_cache_size
108+
return int(serialized_engine.nbytes) <= self.available_engine_cache_size
102109

103110
def clear_cache(self, size: int) -> None:
104111

@@ -113,13 +120,13 @@ def save(
113120
serialized_engine: bytes,
114121
input_names: List[str],
115122
output_names: List[str],
116-
) -> None:
117-
serialized_engine_size = serialized_engine.nbytes
123+
) -> bool:
124+
serialized_engine_size = int(serialized_engine.nbytes)
118125
if serialized_engine_size > self.total_engine_cache_size:
119126
_LOGGER.warning(
120127
f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}."
121128
)
122-
return
129+
return False
123130

124131
if not self.has_available_cache_size(serialized_engine):
125132
self.clear_cache(serialized_engine_size)
@@ -129,10 +136,23 @@ def save(
129136
self.engine_cache_dir,
130137
f"{hash}/engine--{input_names}--{output_names}.trt",
131138
)
132-
os.makedirs(os.path.dirname(path), exist_ok=True)
133-
with open(path, "wb") as f:
134-
f.write(serialized_engine)
139+
try:
140+
os.makedirs(os.path.dirname(path), exist_ok=True)
141+
with open(path, "wb") as f:
142+
f.write(serialized_engine)
143+
except Exception as e:
144+
_LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}")
145+
return False
146+
135147
_LOGGER.info(f"A TRT engine was cached to {path}")
148+
self.available_engine_cache_size -= serialized_engine_size
149+
return True
150+
151+
else:
152+
_LOGGER.warning(
153+
f"The size of the serialized engine {serialized_engine_size} is still larger than the available cache size {self.available_engine_cache_size}."
154+
)
155+
return False
136156

137157
def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]:
138158
directory = os.path.join(self.engine_cache_dir, hash)

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass, field
2-
from typing import Collection, Optional, Set, Union
2+
from typing import Collection, Optional, Set, Type, Union
33

44
from torch.fx.node import Target
55
from torch_tensorrt._Device import Device
@@ -14,6 +14,7 @@
1414
DRYRUN,
1515
ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
1616
ENABLED_PRECISIONS,
17+
ENGINE_CACHE_CLASS,
1718
ENGINE_CACHE_DIR,
1819
ENGINE_CACHE_SIZE,
1920
ENGINE_CAPABILITY,
@@ -36,6 +37,7 @@
3637
WORKSPACE_SIZE,
3738
default_device,
3839
)
40+
from torch_tensorrt.dynamo._engine_caching import BaseEngineCache
3941

4042

4143
@dataclass
@@ -81,6 +83,7 @@ class CompilationSettings:
8183
load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
8284
engine_cache_dir (str): Directory to store the cached TRT engines
8385
engine_cache_size (int): Maximum hard-disk space to use for the engine cache
86+
engine_cache_class (Type[BaseEngineCache]): Engine cache class to use for saving and loading engines. Users can provide their own engine cache class by inheriting from BaseEngineCache
8487
"""
8588

8689
enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -116,3 +119,4 @@ class CompilationSettings:
116119
load_engine_cache: bool = LOAD_ENGINE_CACHE
117120
engine_cache_dir: str = ENGINE_CACHE_DIR
118121
engine_cache_size: int = ENGINE_CACHE_SIZE
122+
engine_cache_class: Type[BaseEngineCache] = ENGINE_CACHE_CLASS

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from torch_tensorrt._enums import dtype
1515
from torch_tensorrt._Input import Input
1616
from torch_tensorrt.dynamo import _defaults
17-
from torch_tensorrt.dynamo._engine_caching import EngineCache
1817
from torch_tensorrt.dynamo._settings import CompilationSettings
1918
from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
2019
from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
@@ -339,11 +338,13 @@ def run(
339338
self.compilation_settings.save_engine_cache
340339
or self.compilation_settings.load_engine_cache
341340
):
342-
engine_cache = EngineCache(
341+
engine_cache = self.compilation_settings.engine_cache_class(
343342
self.compilation_settings.engine_cache_size,
344343
self.compilation_settings.engine_cache_dir,
345344
)
346-
hash_val = EngineCache.get_hash(self.module)
345+
hash_val = self.compilation_settings.engine_cache_class.get_hash(
346+
self.module
347+
)
347348

348349
if self.compilation_settings.load_engine_cache:
349350
# query the cached TRT engine

0 commit comments

Comments
 (0)