@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 from dataclasses import dataclass
-from typing import Optional
+from typing import Dict, Optional

 import torch
 import torch.nn as nn
@@ -35,7 +35,7 @@ class ModelArgs:
     norm_eps: float = 1e-5
     multiple_of = 256
     ffn_dim_multiplier = None
-
+
     def __post_init__(self):
         if self.n_local_heads == -1:
             self.n_local_heads = self.n_heads
@@ -56,7 +56,7 @@ def from_params(cls, params_path):
         with open(params_path, "r") as f:
             params = json.loads(f.read())
         return cls(**params)
-
+
     @classmethod
     def from_name(cls, name: str):
         print(f"name {name}")
@@ -221,7 +221,7 @@ def from_name(cls, name: str):
     @classmethod
     def from_params(cls, params_path: str):
         return cls(ModelArgs.from_params(params_path))
-
+

 class TransformerBlock(nn.Module):
     def __init__(self, config: ModelArgs) -> None:
@@ -258,14 +258,33 @@ def __init__(self, config: ModelArgs):
         self.head_dim = config.head_dim
         self.n_local_heads = config.n_local_heads
         self.dim = config.dim
-        # self._register_load_state_dict_pre_hook(self.load_hook)
-
-    # def load_hook(self, state_dict, prefix, *args):
-    #     if prefix + "wq.weight" in state_dict:
-    #         wq = state_dict.pop(prefix + "wq.weight")
-    #         wk = state_dict.pop(prefix + "wk.weight")
-    #         wv = state_dict.pop(prefix + "wv.weight")
-    #         state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict, prefix, *args):
+        # if prefix + "wq.weight" in state_dict:
+        #     wq = state_dict.pop(prefix + "wq.weight")
+        #     wk = state_dict.pop(prefix + "wk.weight")
+        #     wv = state_dict.pop(prefix + "wv.weight")
+        #     state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
+
+        def _unfuse_wqkv_state_dict(
+            state_dict: Dict[str, torch.Tensor],
+            dim: int,
+        ):
+            for key in list(state_dict):
+                if key.endswith("wqkv.weight"):
+                    tensor = state_dict[key]
+                    wq_key = key.replace("wqkv.weight", "wq.weight")
+                    state_dict[wq_key] = tensor[:dim]
+                    wk_key = key.replace("wqkv.weight", "wk.weight")
+                    wv_key = key.replace("wqkv.weight", "wv.weight")
+                    wk, wv = tensor[dim:].chunk(2, 0)
+                    state_dict[wk_key] = wk
+                    state_dict[wv_key] = wv
+                    state_dict.pop(key)
+                else:
+                    continue
+        _unfuse_wqkv_state_dict(state_dict, self.dim)

     def forward(
         self,
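
For context on the hook introduced in the last hunk: _register_load_state_dict_pre_hook arranges for load_hook to run on the raw checkpoint dict before load_state_dict matches keys against the module's parameters, so a checkpoint saved with a fused wqkv.weight can be split back into separate wq/wk/wv entries in place. Below is a minimal, self-contained sketch of that mechanism, not part of the commit; TinyAttention and its dimensions are invented for illustration, and the fused row layout (wq rows first, then wk and wv in equal halves) is the one the diff's tensor[:dim] / tensor[dim:].chunk(2, 0) split assumes.

import torch
import torch.nn as nn


class TinyAttention(nn.Module):
    """Hypothetical module illustrating the unfuse-on-load pattern."""

    def __init__(self, dim: int = 8, kv_dim: int = 4):
        super().__init__()
        self.dim = dim
        self.wq = nn.Linear(dim, dim, bias=False)
        self.wk = nn.Linear(dim, kv_dim, bias=False)
        self.wv = nn.Linear(dim, kv_dim, bias=False)
        # Same call as in the diff: the hook sees the state_dict before
        # key matching, so it can rewrite checkpoint keys in place.
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict, prefix, *args):
        key = prefix + "wqkv.weight"
        if key in state_dict:
            wqkv = state_dict.pop(key)
            # Assumed fused layout: the first self.dim rows are wq, and the
            # remaining rows split into equal wk and wv halves.
            state_dict[prefix + "wq.weight"] = wqkv[: self.dim]
            wk, wv = wqkv[self.dim :].chunk(2, dim=0)
            state_dict[prefix + "wk.weight"] = wk
            state_dict[prefix + "wv.weight"] = wv


attn = TinyAttention()
fused = {"wqkv.weight": torch.randn(8 + 2 * 4, 8)}
attn.load_state_dict(fused)  # strict load passes: the hook unfused the key

Mutating the dict in place (popping the fused key and inserting the three unfused ones) is what lets a strict load succeed, since key matching happens only after every registered pre-hook has run.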