@@ -326,6 +326,7 @@ def __repr__(self) -> str:
 #
 
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+ #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
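The hunk above cuts off mid-expression; in llama.cpp's convert.py the return continues with .swapaxes(1, 2).reshape(weights.shape). A minimal NumPy sketch of what the full function does, assuming that continuation (the toy shapes are illustrative, not from the commit):

import numpy as np

def permute(weights, n_head: int, n_head_kv: int) -> np.ndarray:
    # GQA case: fold the head count down by the group size.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    # Split each head's rows into two halves, then interleave them:
    # row i of the first half lands next to row i of the second half.
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

# Toy example: 2 heads x 4 rows each, feature dim 3.
w = np.arange(8 * 3).reshape(8, 3)
out = permute(w, n_head=2, n_head_kv=2)
print(out.shape)   # (8, 3): same shape, rows reordered within each head
print(out[:4, 0])  # [0 6 3 9]: head 0's rows now ordered 0, 2, 1, 3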
@@ -818,12 +819,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
             #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
         else:
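The substance of the fix: q_proj is now permuted with (params.n_head, params.n_head) instead of (params.n_head, params.n_head_kv). Since Q always has the full n_head heads (only K and V are shared in grouped-query attention), passing equal head counts keeps the GQA branch in permute() from firing and mis-grouping the query rows; k_proj keeps (n_head, n_head_kv) so its head count is folded down correctly, and the packed W_pack path gets the identical change. A sketch with illustrative GQA numbers (the dims are assumptions, roughly LLaMA-2-70B-like), using the same permute as in the sketch above:

import numpy as np

def permute(weights, n_head, n_head_kv):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

n_head, n_head_kv, head_dim, d_model = 64, 8, 16, 1024

q_proj = np.zeros((n_head * head_dim, d_model))     # 64 query heads
k_proj = np.zeros((n_head_kv * head_dim, d_model))  # only 8 shared kv heads

# Fixed call: equal head counts, so Q is reshaped with all 64 heads.
permute(q_proj, n_head, n_head)     # reshape(64, 2, 8, 1024)

# Unchanged K call: 64 // 8 = 8 inside permute(), matching K's 8 heads.
permute(k_proj, n_head, n_head_kv)  # reshape(8, 2, 8, 1024)

# Old buggy Q call: the sizes still divide evenly, so it ran without
# error, but it grouped Q's 1024 rows as 8 heads instead of 64.
permute(q_proj, n_head, n_head_kv)  # reshape(8, 2, 64, 1024) -- wrong grouping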