[AOTI] Remove the original model weights in Python deployment

desertfire · desertfire · commit 962ec0d913bb · 2024-10-31T19:48:28.000-07:00
Summary: Fixes #1302. Because the AOTI-compiled model contains its own copy of the model weights, we need to release the corresponding eager model weights in the Python deployment path.
diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py
@@ -544,6 +544,19 @@ def _initialize_model(
             # attributes will NOT be seen on by AOTI-compiled forward
             # function, e.g. calling model.setup_cache will NOT touch
             # AOTI compiled and maintained model buffers such as kv_cache.
+            # Using cpp runner to run AOTI compiled model is recommended.
+            #
+            # Release the loaded eager model to free up device memory.
+            # The AOTI-compiled model contains a copy of the model weights.
+            model.model = None
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
+
+            def do_nothing(max_batch_size, max_seq_length):
+                pass
+            model.setup_caches = do_nothing
+
             model.forward = torch._export.aot_load(
                 str(builder_args.dso_path.absolute()), builder_args.device
             )