
Commit db2a523

refactor and add LRU to clear cache
1 parent 37d3311 commit db2a523

File tree

6 files changed: +171 -38 lines changed

examples/dynamo/engine_caching_example.py

Lines changed: 76 additions & 4 deletions
@@ -1,9 +1,17 @@
+import ast
+import logging
 import os
+from typing import List, Optional, Tuple

 import numpy as np
 import torch
 import torch_tensorrt as torch_trt
 import torchvision.models as models
+from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
+from torch_tensorrt.dynamo._engine_caching import BaseEngineCache
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+

 np.random.seed(0)
 torch.manual_seed(0)
@@ -14,7 +22,6 @@
 debug = False
 min_block_size = 1
 use_python_runtime = False
-TIMING_CACHE_PATH = "/tmp/timing_cache.bin"


 def remove_timing_cache(path=TIMING_CACHE_PATH):
@@ -26,10 +33,16 @@ def dynamo_path(iterations=3):
     times = []
     start = torch.cuda.Event(enable_timing=True)
     end = torch.cuda.Event(enable_timing=True)
-    inputs = [torch.rand(size).to("cuda")]
-    exp_program = torch.export.export(model, tuple(inputs))
+
+    example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),)
+    # Mark the dim0 of inputs as dynamic
+    batch = torch.export.Dim("batch", min=1, max=200)
+    exp_program = torch.export.export(
+        model, args=example_inputs, dynamic_shapes={"x": {0: batch}}
+    )
+
     for i in range(iterations):
-        inputs = [torch.rand(size).to("cuda")]
+        inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
         remove_timing_cache()  # remove timing cache for engine caching messurement
         if i == 0:
             save_engine_cache = False
@@ -49,6 +62,7 @@ def dynamo_path(iterations=3):
             make_refitable=True,
             save_engine_cache=save_engine_cache,
             load_engine_cache=load_engine_cache,
+            engine_cache_size=1 << 30,  # 1GB
         )
         end.record()
         torch.cuda.synchronize()
@@ -57,8 +71,65 @@ def dynamo_path(iterations=3):
     print("-----dynamo_path-----> compilation time:", times, "milliseconds")


+# Custom Engine Cache
+class MyEngineCache(BaseEngineCache):
+
+    def __init__(
+        self,
+        engine_cache_size: int,
+        engine_cache_dir: str,
+    ) -> None:
+        self.total_engine_cache_size = engine_cache_size
+        self.available_engine_cache_size = engine_cache_size
+        self.engine_cache_dir = engine_cache_dir
+
+    def save(
+        self,
+        hash: str,
+        serialized_engine: bytes,
+        input_names: List[str],
+        output_names: List[str],
+    ) -> bool:
+        path = os.path.join(
+            self.engine_cache_dir,
+            f"{hash}/engine--{input_names}--{output_names}.trt",
+        )
+        try:
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            with open(path, "wb") as f:
+                f.write(serialized_engine)
+        except Exception as e:
+            _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}")
+            return False
+
+        _LOGGER.info(f"A TRT engine was cached to {path}")
+        serialized_engine_size = int(serialized_engine.nbytes)
+        self.available_engine_cache_size -= serialized_engine_size
+        return True
+
+    def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]:
+        directory = os.path.join(self.engine_cache_dir, hash)
+        if os.path.exists(directory):
+            engine_list = os.listdir(directory)
+            assert (
+                len(engine_list) == 1
+            ), f"There are more than one engine {engine_list} under {directory}."
+            path = os.path.join(directory, engine_list[0])
+            input_names_str, output_names_str = (
+                engine_list[0].split(".trt")[0].split("--")[1:]
+            )
+            input_names = ast.literal_eval(input_names_str)
+            output_names = ast.literal_eval(output_names_str)
+            with open(path, "rb") as f:
+                serialized_engine = f.read()
+            return serialized_engine, input_names, output_names
+        else:
+            return None, [], []
+
+
 def compile_path(iterations=3):
     times = []
+    engine_cache = MyEngineCache(200 * (1 << 20), "/tmp/your_dir")
     start = torch.cuda.Event(enable_timing=True)
     end = torch.cuda.Event(enable_timing=True)

@@ -87,6 +158,7 @@ def compile_path(iterations=3):
                 "make_refitable": True,
                 "save_engine_cache": save_engine_cache,
                 "load_engine_cache": load_engine_cache,
+                "engine_cache_instance": engine_cache,  # use custom engine cache
             },
         )
         compiled_model(*inputs)  # trigger the compilation
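The example's MyEngineCache encodes the input and output names directly into the cached file name and recovers them with ast.literal_eval when loading. Below is a minimal standalone sketch of that file-name round trip, using a made-up hash and a dummy payload rather than a real TRT engine (not part of the commit):

import ast
import os

cache_dir = "/tmp/your_dir"           # illustrative cache location
hash_val = "abc123"                   # hypothetical engine hash
input_names, output_names = ["x"], ["output0"]

# save(): names are embedded in the file name under a per-hash directory
path = os.path.join(cache_dir, f"{hash_val}/engine--{input_names}--{output_names}.trt")
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "wb") as f:
    f.write(b"fake-serialized-engine")  # stand-in for serialized engine bytes

# load(): recover the names from the file name, then read the bytes back
file_name = os.listdir(os.path.dirname(path))[0]
in_str, out_str = file_name.split(".trt")[0].split("--")[1:]
assert ast.literal_eval(in_str) == input_names
assert ast.literal_eval(out_str) == output_names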

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 27 additions & 4 deletions
@@ -3,7 +3,7 @@
 import collections.abc
 import logging
 import warnings
-from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Type, Union
+from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union

 import torch
 from torch.export import ExportedProgram
@@ -84,7 +84,7 @@ def compile(
     load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE,
     engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR,
     engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE,
-    engine_cache_class: Type[BaseEngineCache] = EngineCache,
+    engine_cache_instance: Optional[BaseEngineCache] = None,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -149,7 +149,7 @@ def compile(
         load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
         engine_cache_dir (str): Directory to store the cached TRT engines
         engine_cache_size (int): Maximum hard-disk space to use for the engine cache
-        engine_cache_class (BaseEngineCache): Engine cache class to use for saving and loading engines. Users can provide their own engine cache class by inheriting from BaseEngineCache
+        engine_cache_instance (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -213,6 +213,11 @@ def compile(
     gm = post_lowering(gm, torch_inputs)
     logger.debug("Lowered Input graph: " + str(gm.graph))

+    if engine_cache_instance is None:
+        engine_cache_instance = EngineCacheInstanceCreator.get_creator(
+            engine_cache_size, engine_cache_dir
+        ).engine_cache_instance
+
     compilation_options = {
         "enabled_precisions": (
             enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS
@@ -249,7 +254,7 @@ def compile(
         "load_engine_cache": load_engine_cache,
         "engine_cache_dir": engine_cache_dir,
         "engine_cache_size": engine_cache_size,
-        "engine_cache_class": engine_cache_class,
+        "engine_cache_instance": engine_cache_instance,
     }

     settings = CompilationSettings(**compilation_options)
@@ -666,3 +671,21 @@ def convert_module_to_trt_engine(
     engine_bytearray = engine_bytes.getvalue()

     return engine_bytearray
+
+
+class EngineCacheInstanceCreator:
+    engine_cache_creator = None
+
+    def __init__(self, engine_cache_size: int, engine_cache_dir: str) -> None:
+        self.engine_cache_instance = EngineCache(
+            engine_cache_size=engine_cache_size,
+            engine_cache_dir=engine_cache_dir,
+        )
+
+    @classmethod
+    def get_creator(
+        cls, engine_cache_size: int, engine_cache_dir: str
+    ) -> EngineCacheInstanceCreator:
+        if cls.engine_cache_creator is None:
+            cls.engine_cache_creator = cls(engine_cache_size, engine_cache_dir)
+        return cls.engine_cache_creator
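EngineCacheInstanceCreator is a lazily-initialized, process-wide singleton: the first compile() call that omits engine_cache_instance builds the default EngineCache, and later calls reuse that same instance, so their engine_cache_size/engine_cache_dir arguments no longer affect the default cache. A stripped-down sketch of the same pattern in isolation (the class and variable names here are illustrative, not from the commit):

class LazyCacheCreator:
    # Hypothetical stand-in for EngineCacheInstanceCreator
    _creator = None

    def __init__(self, size: int, cache_dir: str) -> None:
        # In the real code this field holds an EngineCache; here just the args.
        self.cache = {"size": size, "dir": cache_dir}

    @classmethod
    def get_creator(cls, size: int, cache_dir: str) -> "LazyCacheCreator":
        if cls._creator is None:  # constructed only on the first call
            cls._creator = cls(size, cache_dir)
        return cls._creator


first = LazyCacheCreator.get_creator(1 << 30, "/tmp/cache_a")
second = LazyCacheCreator.get_creator(1 << 20, "/tmp/cache_b")  # ignored: already created
assert first is second and second.cache["dir"] == "/tmp/cache_a"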

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 7 additions & 3 deletions
@@ -32,12 +32,16 @@
 DRYRUN = False
 HARDWARE_COMPATIBLE = False
 SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8}
-TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin")
+TIMING_CACHE_PATH = os.path.join(
+    tempfile.gettempdir(), "torch_tensorrt_engine_cache", "timing_cache.bin"
+)
 SAVE_ENGINE_CACHE = True
 LOAD_ENGINE_CACHE = True
 ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
-ENGINE_CACHE_SIZE = 1 << 30
-ENGINE_CACHE_CLASS = EngineCache
+ENGINE_CACHE_SIZE = 1073741824
+ENGINE_CACHE_INSTANCE = EngineCache(
+    engine_cache_size=ENGINE_CACHE_SIZE, engine_cache_dir=ENGINE_CACHE_DIR
+)


 def default_device() -> Device:
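The default budget is unchanged in value, only in spelling: 1073741824 is the same 1 GiB previously written as a bit shift. A one-line check:

assert 1073741824 == 1 << 30 == 2**30  # 1 GiB default engine cache size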

py/torch_tensorrt/dynamo/_engine_caching.py

Lines changed: 54 additions & 16 deletions
@@ -2,8 +2,9 @@
 import copy
 import logging
 import os
+import shutil
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Tuple, cast
+from typing import Any, Dict, List, Optional, Tuple, cast

 import torch
 from torch._inductor.codecache import FxGraphCachePickler
@@ -75,15 +76,6 @@ def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]:
         """
         pass

-    @abstractmethod
-    def clear_cache(self, size: int) -> None:
-        """Clear the cache to make sure at least `size` bytes are available
-
-        Args:
-            size (int): the needed size
-        """
-        pass
-

 class EngineCache(BaseEngineCache):

@@ -95,6 +87,7 @@ def __init__(
         self.total_engine_cache_size = engine_cache_size
         self.available_engine_cache_size = engine_cache_size
         self.engine_cache_dir = engine_cache_dir
+        self.hash2size_map: Dict[str, int] = {}

     def has_available_cache_size(self, serialized_engine: bytes) -> bool:
         """Check if the cache has available space for saving the serialized engine
@@ -107,12 +100,53 @@ def has_available_cache_size(self, serialized_engine: bytes) -> bool:
         """
         return int(serialized_engine.nbytes) <= self.available_engine_cache_size

-    def clear_cache(self, size: int) -> None:
+    def clear_cache(self, needed_min_size: int) -> bool:
+        """Clear the cache to make sure at least `needed_min_size` bytes are available, if possible

-        def LRU() -> None:
-            pass
+        Args:
+            needed_min_size (int): the minimum needed size

-        pass
+        Returns:
+            bool: whether the cache is cleared successfully
+        """
+
+        def LRU() -> bool:
+            """Clear the Least Recently Used engine in the cache"""
+            # Get the list of engine directories
+            engines_hash_values = os.listdir(self.engine_cache_dir)
+            # Sort the engine directories by modification time (oldest first)
+            engines_hash_values.sort(
+                key=lambda x: os.path.getmtime(os.path.join(self.engine_cache_dir, x))
+            )
+            # Iterate over the engine directories and remove the oldest ones until enough space is available
+            for engine_hash in engines_hash_values:
+                if self.available_engine_cache_size >= needed_min_size:
+                    break
+                engine_path = os.path.join(self.engine_cache_dir, engine_hash)
+                try:
+                    # Remove the entire directory
+                    shutil.rmtree(engine_path)
+                    # Update the available cache size
+                    self.available_engine_cache_size += self.hash2size_map.pop(
+                        engine_hash, 0
+                    )
+                    _LOGGER.info(
+                        f"Removed the engine cache at {engine_path}, available cache size: {self.available_engine_cache_size} bytes."
+                    )
+                except Exception as e:
+                    _LOGGER.warning(
+                        f"Failed to clear the engine cache at {engine_path}: {e}"
+                    )
+                    return False
+            return True
+
+        if not os.path.exists(self.engine_cache_dir):
+            return False
+
+        _LOGGER.info(
+            f"Total cache size: {self.total_engine_cache_size} bytes; available cache size: {self.available_engine_cache_size} bytes. Clearing the cache to make sure at least {needed_min_size} bytes are available."
+        )
+        return LRU()

     def save(
         self,
@@ -128,9 +162,11 @@ def save(
             )
             return False

+        # Check if there is enough available cache size for the serialized engine
         if not self.has_available_cache_size(serialized_engine):
            self.clear_cache(serialized_engine_size)

+        # Save the serialized engine to the cache directory
        if self.has_available_cache_size(serialized_engine):
            path = os.path.join(
                self.engine_cache_dir,
@@ -140,12 +176,14 @@ def save(
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, "wb") as f:
                    f.write(serialized_engine)
+                self.hash2size_map[hash] = serialized_engine_size
+                self.available_engine_cache_size -= serialized_engine_size
+                _LOGGER.info(f"A TRT engine was cached to {path}")
+
            except Exception as e:
                _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}")
                return False

-            _LOGGER.info(f"A TRT engine was cached to {path}")
-            self.available_engine_cache_size -= serialized_engine_size
            return True

        else:
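The eviction helper orders the per-hash engine directories by modification time and deletes the oldest until enough space is free, crediting the freed bytes back through hash2size_map. Since load() as written does not touch the directory, the ordering effectively reflects when each engine was written. A self-contained sketch of the same mtime-based eviction, using throwaway files and made-up sizes rather than real engines:

import os
import shutil
import tempfile
import time

cache_dir = tempfile.mkdtemp()   # illustrative stand-in for the engine cache dir
total_size = 200                 # total budget in bytes (illustrative)
available = total_size
sizes = {}                       # name -> size, like hash2size_map

# Populate three fake "engines", 60 bytes each, oldest first.
for name in ("old", "mid", "new"):
    d = os.path.join(cache_dir, name)
    os.makedirs(d)
    with open(os.path.join(d, "engine.trt"), "wb") as f:
        f.write(b"x" * 60)
    sizes[name] = 60
    available -= 60
    time.sleep(0.01)             # force distinct directory mtimes

def clear_cache(needed_min_size: int) -> bool:
    """Evict oldest-written entries until `needed_min_size` bytes are free."""
    global available
    entries = sorted(
        os.listdir(cache_dir),
        key=lambda e: os.path.getmtime(os.path.join(cache_dir, e)),
    )
    for entry in entries:
        if available >= needed_min_size:
            break
        shutil.rmtree(os.path.join(cache_dir, entry))
        available += sizes.pop(entry, 0)
    return available >= needed_min_size

clear_cache(60)
print(sorted(os.listdir(cache_dir)))  # ['mid', 'new'] -- only the oldest entry was evicted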

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Collection, Optional, Set, Type, Union
+from typing import Collection, Optional, Set, Union

 from torch.fx.node import Target
 from torch_tensorrt._Device import Device
@@ -14,8 +14,8 @@
     DRYRUN,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     ENABLED_PRECISIONS,
-    ENGINE_CACHE_CLASS,
     ENGINE_CACHE_DIR,
+    ENGINE_CACHE_INSTANCE,
     ENGINE_CACHE_SIZE,
     ENGINE_CAPABILITY,
     HARDWARE_COMPATIBLE,
@@ -83,7 +83,7 @@ class CompilationSettings:
         load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk
         engine_cache_dir (str): Directory to store the cached TRT engines
         engine_cache_size (int): Maximum hard-disk space to use for the engine cache
-        engine_cache_class (BaseEngineCache): Engine cache class to use for saving and loading engines. Users can provide their own engine cache class by inheriting from BaseEngineCache
+        engine_cache_instance (BaseEngineCache): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache
     """

     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -119,4 +119,4 @@ class CompilationSettings:
     load_engine_cache: bool = LOAD_ENGINE_CACHE
     engine_cache_dir: str = ENGINE_CACHE_DIR
     engine_cache_size: int = ENGINE_CACHE_SIZE
-    engine_cache_class: Type[BaseEngineCache] = ENGINE_CACHE_CLASS
+    engine_cache_instance: BaseEngineCache = ENGINE_CACHE_INSTANCE

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 3 additions & 7 deletions
@@ -303,6 +303,7 @@ def _save_timing_cache(
         This is called after a TensorRT engine is built. Save the timing cache
         """
         timing_cache = builder_config.get_timing_cache()
+        os.makedirs(os.path.dirname(timing_cache_path), exist_ok=True)
         with open(timing_cache_path, "wb") as timing_cache_file:
             timing_cache_file.write(memoryview(timing_cache.serialize()))

@@ -338,13 +339,8 @@ def run(
             self.compilation_settings.save_engine_cache
             or self.compilation_settings.load_engine_cache
         ):
-            engine_cache = self.compilation_settings.engine_cache_class(
-                self.compilation_settings.engine_cache_size,
-                self.compilation_settings.engine_cache_dir,
-            )
-            hash_val = self.compilation_settings.engine_cache_class.get_hash(
-                self.module
-            )
+            engine_cache = self.compilation_settings.engine_cache_instance
+            hash_val = engine_cache.get_hash(self.module)

             if self.compilation_settings.load_engine_cache:
                 # query the cached TRT engine

0 commit comments
