fix(convert): Add support for permuted kvq bias weights in HF conversion

gabe-l-hart · gabe-l-hart · commit 63fee967d6df · 2024-10-07T20:17:48.000-06:00
Branch: GraniteCodeSupport

Signed-off-by: Gabe Goodhart &lt;ghart@us.ibm.com&gt;
diff --git a/torchchat/cli/convert_hf_checkpoint.py b/torchchat/cli/convert_hf_checkpoint.py
@@ -100,11 +100,10 @@ def convert_hf_checkpoint(
     bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_heads):
-        dim = config.dim
         return (
-            w.view(n_heads, 2, config.head_dim // 2, dim)
+            w.view(n_heads, 2, config.head_dim // 2, *w.shape[1:])
             .transpose(1, 2)
-            .reshape(config.head_dim * n_heads, dim)
+            .reshape(w.shape)
         )
 
     merged_result = {}
@@ -137,6 +136,7 @@ def load_safetensors():
                 continue
         assert state_dict is not None, f"Unable to load tensors from {file}"
         merged_result.update(state_dict)
+
     final_result = {}
     for key, value in merged_result.items():
         if "layers" in key:
@@ -150,16 +150,20 @@ def load_safetensors():
         final_result[new_key] = value
 
     for key in tuple(final_result.keys()):
-        if "wq.weight" in key:
+        if "wq.weight" in key or "wq.bias" in key:
+            wk_key = key.replace("wq", "wk")
+            wv_key = key.replace("wq", "wv")
             q = final_result[key]
-            k = final_result[key.replace("wq", "wk")]
-            v = final_result[key.replace("wq", "wv")]
+            k = final_result[wk_key]
+            v = final_result[wv_key]
+            print(key)
             q = permute(q, config.n_heads)
+            print(wk_key)
             k = permute(k, config.n_local_heads)
             final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
             del final_result[key]
-            del final_result[key.replace("wq", "wk")]
-            del final_result[key.replace("wq", "wv")]
+            del final_result[wk_key]
+            del final_result[wv_key]
     print(f"Saving checkpoint to {model_dir / 'model.pth'}. This may take a while.")
     torch.save(final_result, model_dir / "model.pth")
     print("Done.")