
Commit d5f898d

lucylq authored and facebook-github-bot committed
Create model with device='meta'
Summary:
See discussion: D54825007

Two optimizations:
1. Use `mmap=True` to load the checkpoint.
2. Create the model with device="meta". Tensors created in this context do not carry data; they are only assigned real storage when we load the state dict. Previously, the llama7b model was created in fp32 (the default), using up 25GB of RAM.
   - Note: non-persistent buffers and tensors that do not have keys in the state dict will be created with device="meta" as well. These have to be manually initialized when creating the model. See D46784302.

Checkpoint loading time: 10s -> 0.011s
Peak memory usage: [37.8GB](https://lookaside.facebook.com/intern/diff/file/data/?number=1467921211&download=1) -> [25.5GB](https://lookaside.facebook.com/intern/diff/file/data/?number=1468357208&download=1)
Model creation time: [77s](https://lookaside.facebook.com/intern/diff/file/data/?number=1468360493&download=1) -> [11.6s](https://lookaside.facebook.com/intern/diff/file/data/?number=1468364061&download=1)

Follow-up (T182328293): iterate over params/buffers and initialize any uninitialized tensors, instead of initializing them manually, which is model-specific.

Thanks iseeyuan for the tips: https://pytorch.org/tutorials/recipes/recipes/module_load_state_dict_tips.html

bypass-github-export-checks

Reviewed By: iseeyuan

Differential Revision: D54871495

fbshipit-source-id: f6a8d01c88ce45bb5d2358cc522ba81c0e1fbd5b
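
For context, a minimal sketch of the loading pattern this change adopts (following the PyTorch recipe linked above). `TinyModel` and the checkpoint path are placeholders, not part of this diff:

```python
import torch
from torch import nn


# Hypothetical stand-in for the llama2 Transformer touched by this diff.
class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)


torch.save(TinyModel().state_dict(), "/tmp/checkpoint.pth")  # placeholder checkpoint

# 1. mmap=True maps the checkpoint file instead of reading every tensor into RAM up front.
checkpoint = torch.load("/tmp/checkpoint.pth", map_location="cpu", mmap=True)

# 2. Under device="meta", parameters and buffers are created without storage,
#    so building the model does not allocate the full fp32 weights.
with torch.device("meta"):
    model = TinyModel()

# assign=True swaps the meta tensors for the checkpoint tensors outright; an
# in-place copy into a tensor that has no storage would be a no-op.
model.load_state_dict(checkpoint, strict=False, assign=True)
```

Any tensor that has no matching key in the checkpoint stays on the meta device after this, which is why the buffers in llama_transformer.py below are pinned to device="cpu".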
1 parent d0512b6 commit d5f898d

File tree

2 files changed: +17 -5 lines

examples/models/llama2/llama_transformer.py

Lines changed: 4 additions & 1 deletion
@@ -110,7 +110,9 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:


 def precompute_freqs_cis(dim: int, end: int, theta: float):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    freqs = 1.0 / (
+        theta ** (torch.arange(0, dim, 2, device="cpu")[: (dim // 2)].float() / dim)
+    )
     t = torch.arange(end, device=freqs.device)  # pyre-ignore
     freqs = torch.outer(t, freqs).float()  # pyre-ignore
     freqs_cos = torch.cos(freqs)

@@ -171,6 +173,7 @@ def __init__(self, args: ModelArgs, layer_id: int):
         mask = torch.full(
             (1, 1, args.max_seq_len, args.max_seq_len),
             float("-inf"),
+            device="cpu",
         )

         mask = torch.triu(mask, diagonal=1)
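
Why pin device="cpu" here: inside the `with torch.device("meta")` block added in model.py, factory calls such as `torch.arange` and `torch.full` that omit an explicit device would also land on the meta device, leaving `freqs` and the attention mask without data. A standalone illustration of that behavior (not part of this diff):

```python
import torch

with torch.device("meta"):
    implicit = torch.arange(0, 8, 2)                # inherits the meta device: no data
    explicit = torch.arange(0, 8, 2, device="cpu")  # real CPU tensor, carries data

print(implicit.is_meta)  # True
print(explicit.is_meta)  # False
```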

examples/models/llama2/model.py

Lines changed: 13 additions & 4 deletions
@@ -70,7 +70,7 @@ def __init__(self, **kwargs):
         # Follow the instruction in https://github.com/facebookresearch/llama to download the model
         device = "cpu"
         # flake8: noqa: TOR102
-        checkpoint = torch.load(checkpoint_path, map_location=device)
+        checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True)
         fairseq2_checkpoint = kwargs.get("fairseq2", False)
         if fairseq2_checkpoint:
             print("Using fairseq2 checkpoint")

@@ -130,7 +130,11 @@ def __init__(self, **kwargs):
             for key, weights in checkpoint.items():
                 print(f"{key} : {weights.numel()} : {weights.size()}")
             print("============= /weights ================")
-        self.model_ = Transformer(model_args)
+
+        # Within the device="meta" context, tensors that are created do not carry data.
+        # They possess all other metadata a tensor carries such as size, stride, requires_grad.
+        with torch.device("meta"):
+            self.model_ = Transformer(model_args)

         if "int8" in str(checkpoint_path):
             print("Using int8 weight-only quantization!")

@@ -142,11 +146,16 @@ def __init__(self, **kwargs):
             print("Using int4 weight-only quantization!")
             from .quantize import Int8DynActInt4WeightQuantHandler

-            simple_quantizer = INt8dynactint4weightquanthandler(self.model_)
+            simple_quantizer = Int8DynActInt4WeightQuantHandler(self.model_)
             self.model_ = simple_quantizer.convert_for_runtime()

+        # assign=True: load params/buffers by assignment instead of performing an in-place copy.
+        # Because we are using device="meta", tensors do not have memory associated with them
+        # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario.
         self.model_.load_state_dict(
-            checkpoint, strict=False
+            checkpoint,
+            strict=False,
+            assign=True,
         )  # self.model_ = Transformer(gptconf)

     def get_eager_model(self):
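
The follow-up noted in the summary (iterate over params/buffers to find tensors the state dict did not cover, instead of initializing them by hand per model) could look roughly like the sketch below; `find_meta_tensors` is a hypothetical helper, not part of this commit:

```python
import torch
from torch import nn


def find_meta_tensors(module: nn.Module) -> list[str]:
    """Names of params/buffers still on the meta device after load_state_dict,
    e.g. non-persistent buffers or entries missing from the checkpoint."""
    meta_names = [name for name, p in module.named_parameters() if p.is_meta]
    meta_names += [name for name, b in module.named_buffers() if b.is_meta]
    return meta_names


# Example: nothing loaded yet, so every tensor is still uninitialized.
with torch.device("meta"):
    probe = nn.Linear(4, 4)
print(find_meta_tensors(probe))  # ['weight', 'bias']
```

Tensors reported this way would then be re-created on a real device (e.g. by re-running their original init), which is what the manual device="cpu" pins above already do for the rope frequencies and the attention mask.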
