Commit 16a8acd

rwkv7: converter script simplification
Signed-off-by: Molly Sophia <[email protected]>
1 parent e9c6311 commit 16a8acd

3 files changed: +98 −104 lines changed

convert_hf_to_gguf.py

Lines changed: 51 additions & 50 deletions
@@ -3480,7 +3480,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             yield (new_name, data)
 
 
-@Model.register("Rwkv7ForCausalLM")
+@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
 class Rwkv7Model(Rwkv6Model):
     model_arch = gguf.MODEL_ARCH.RWKV7
 
@@ -3489,16 +3489,26 @@ def calc_lora_rank(self, hidden_size, exponent, multiplier):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        head_size = self.hparams["head_size"]
+        try:
+            head_size = self.hparams["head_size"]
+            layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        except KeyError:
+            head_size = self.hparams["head_dim"]
+            layer_norm_eps = self.hparams["norm_eps"]
         hidden_size = self.hparams["hidden_size"]
-        layer_norm_eps = self.hparams["layer_norm_epsilon"]
         intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
 
         # ICLR: In-Context-Learning-Rate
-        lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
-        lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
-        lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
-        lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        try:
+            lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        except KeyError:
+            lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -3517,17 +3527,43 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(0)
 
     lerp_weights: dict[int, dict[str, Tensor]] = {}
+    lora_needs_transpose: bool = True
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # unify tensor names here to make life easier
+        name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
+        name = name.replace("self_attn", "attention").replace("attn", "attention")
+        name = name.replace("time_mixer.", "")
+        # lora layer names in fla-hub's impl
+        if "_lora.lora" in name:
+            self.lora_needs_transpose = False
+            name = name.replace("_lora.lora.0.weight", "1.weight")
+            name = name.replace("_lora.lora.2.weight", "2.weight")
+            name = name.replace("_lora.lora.2.bias", "0.weight")
+
+        name = name.replace("feed_forward_norm", "ln2")
+        name = name.replace("g_norm", "ln_x")
+
+        if "attention.v" in name and (not "value" in self.map_tensor_name(name)) and bid == 0:
+            # some models have dummy v0/v1/v2 on first layer while others don't
+            # ignore them all since they are not used
+            return
+
         if bid is not None and "attention.x_" in name:
-            try:
-                self.lerp_weights[bid][name] = data_torch
-            except KeyError:
-                self.lerp_weights[bid] = {name: data_torch}
-            if all(f"model.blocks.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
+            if "attention.x_x" in name:
+                # already concatenated
                 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = torch.stack([self.lerp_weights[bid][f"model.blocks.{bid}.attention.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
+                data = data_torch.reshape(6, 1, -1)
                 yield (new_name, data)
+            else:
+                try:
+                    self.lerp_weights[bid][name] = data_torch
+                except KeyError:
+                    self.lerp_weights[bid] = {name: data_torch}
+                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
+                    new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
+                    yield (new_name, data)
             return
         else:
            data_torch = data_torch.squeeze()
@@ -3536,7 +3572,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
                 new_name += ".weight"
 
-            if any(
+            if self.lora_needs_transpose and any(
                 new_name.endswith(t) for t in [
                     "time_mix_w1.weight", "time_mix_w2.weight",
                     "time_mix_a1.weight", "time_mix_a2.weight",
@@ -3558,7 +3594,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @Model.register("RwkvHybridForCausalLM")
-class ARwkv7Model(Model):
+class ARwkv7Model(Rwkv7Model):
     model_arch = gguf.MODEL_ARCH.ARWKV7
 
     def set_vocab(self):
@@ -3599,41 +3635,6 @@ def set_gguf_parameters(self):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
 
-    lerp_weights: dict[int, dict[str, Tensor]] = {}
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if bid is not None and "self_attn.time_mixer.x_" in name:
-            try:
-                self.lerp_weights[bid][name] = data_torch
-            except KeyError:
-                self.lerp_weights[bid] = {name: data_torch}
-            if all(f"model.layers.{bid}.self_attn.time_mixer.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
-                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.self_attn.time_mixer.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
-                yield (new_name, data)
-            return
-        else:
-            data_torch = data_torch.squeeze()
-            new_name = self.map_tensor_name(name)
-
-            if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
-                new_name += ".weight"
-
-            if any(
-                new_name.endswith(t) for t in [
-                    "time_mix_w1.weight", "time_mix_w2.weight",
-                    "time_mix_a1.weight", "time_mix_a2.weight",
-                    "time_mix_v1.weight", "time_mix_v2.weight",
-                    "time_mix_g1.weight", "time_mix_g2.weight",
-                ]
-            ):
-                data_torch = data_torch.transpose(0, 1)
-
-            if 'r_k' in new_name:
-                data_torch = data_torch.flatten()
-
-            yield (new_name, data_torch)
-
 
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
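
Taken together, the converter changes above let a single Rwkv7Model class cover both hparam flavors: it reads head_size / layer_norm_epsilon / lora_rank_* first and falls back to head_dim / norm_eps / *_low_rank_dim on a KeyError, presumably matching the fla-hub style checkpoints whose "_lora.lora.*" tensor names are also renamed here. A minimal standalone sketch of that fallback pattern (the read_head_size helper and the sample dicts are hypothetical; the real code reads self.hparams as loaded from the model's config.json):

def read_head_size(hparams: dict) -> int:
    # mirror of the try/except fallback used in set_gguf_parameters()
    try:
        return hparams["head_size"]   # key present in one config flavor
    except KeyError:
        return hparams["head_dim"]    # fallback key used by the other flavor

# two hypothetical configs that should resolve to the same head size
assert read_head_size({"head_size": 64}) == 64
assert read_head_size({"head_dim": 64, "norm_eps": 1e-5}) == 64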

gguf-py/gguf/tensor_mapping.py

Lines changed: 37 additions & 51 deletions
@@ -44,6 +44,7 @@ class TensorNameMap:
             "transformer.norm", # openelm
             "rwkv.blocks.0.pre_ln", # rwkv6
             "model.pre_ln", # rwkv7
+            "model.layers.0.pre_norm", # rwkv7
             "backbone.norm", # wavtokenizer
         ),
 
@@ -126,15 +127,15 @@ class TensorNameMap:
             "encoder.layers.{bid}.input_layernorm", # chatglm
             "transformer.layers.{bid}.attn_norm", # openelm
             "rwkv.blocks.{bid}.ln1", # rwkv6
-            "model.blocks.{bid}.ln1", # rwkv7
+            "model.layers.{bid}.ln1", # rwkv7
         ),
 
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn", # falcon40b
             "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
             "rwkv.blocks.{bid}.ln2", # rwkv6
-            "model.blocks.{bid}.ln2", # rwkv7
+            "model.layers.{bid}.ln2", # rwkv7
         ),
 
         # Attention query-key-value
@@ -468,77 +469,63 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.TIME_MIX_W0: (
-            "model.blocks.{bid}.attention.w0", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w0", # arwkv7
+            "model.layers.{bid}.attention.w0", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_W1: (
             "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
-            "model.blocks.{bid}.attention.w1", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w1", # arwkv7
+            "model.layers.{bid}.attention.w1", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_W2: (
             "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
-            "model.blocks.{bid}.attention.w2", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w2", # arwkv7
+            "model.layers.{bid}.attention.w2", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A0: (
-            "model.blocks.{bid}.attention.a0", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a0", # arwkv7
+            "model.layers.{bid}.attention.a0", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A1: (
-            "model.blocks.{bid}.attention.a1", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a1", # arwkv7
+            "model.layers.{bid}.attention.a1", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A2: (
-            "model.blocks.{bid}.attention.a2", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a2", # arwkv7
+            "model.layers.{bid}.attention.a2", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V0: (
-            "model.blocks.{bid}.attention.v0", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v0", # arwkv7
+            "model.layers.{bid}.attention.v0", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V1: (
-            "model.blocks.{bid}.attention.v1", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v1", # arwkv7
+            "model.layers.{bid}.attention.v1", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V2: (
-            "model.blocks.{bid}.attention.v2", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v2", # arwkv7
+            "model.layers.{bid}.attention.v2", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_G1: (
-            "model.blocks.{bid}.attention.g1", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g1", # arwkv7
+            "model.layers.{bid}.attention.g1", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_G2: (
-            "model.blocks.{bid}.attention.g2", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g2", # arwkv7
+            "model.layers.{bid}.attention.g2", # rwkv7
        ),
 
         MODEL_TENSOR.TIME_MIX_K_K: (
-            "model.blocks.{bid}.attention.k_k", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_k", # arwkv7
+            "model.layers.{bid}.attention.k_k", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_K_A: (
-            "model.blocks.{bid}.attention.k_a", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_a", # arwkv7
+            "model.layers.{bid}.attention.k_a", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_R_K: (
-            "model.blocks.{bid}.attention.r_k", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.r_k", # arwkv7
+            "model.layers.{bid}.attention.r_k", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_LERP_X: (
@@ -591,47 +578,46 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.TIME_MIX_KEY: (
-            "rwkv.blocks.{bid}.attention.key", # rwkv6
-            "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
-            "model.blocks.{bid}.attention.key", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.key.weight", # arwkv7
+            "rwkv.blocks.{bid}.attention.key", # rwkv6
+            "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.key", # rwkv7
+            "model.layers.{bid}.attention.k_proj", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_VALUE: (
-            "rwkv.blocks.{bid}.attention.value", # rwkv6
-            "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
-            "model.blocks.{bid}.attention.value", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.value.weight", # arwkv7
+            "rwkv.blocks.{bid}.attention.value", # rwkv6
+            "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.value", # rwkv7
+            "model.layers.{bid}.attention.v_proj", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.attention.receptance", # rwkv6
-            "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
-            "model.blocks.{bid}.attention.receptance", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.receptance.weight", # arwkv7
+            "rwkv.blocks.{bid}.attention.receptance", # rwkv6
+            "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.receptance", # rwkv7
+            "model.layers.{bid}.attention.r_proj", # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_GATE: (
             "rwkv.blocks.{bid}.attention.gate", # rwkv6
             "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
-            "model.layers.{bid}.self_attn.time_mixer.gate.weight", # arwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_LN: (
             "rwkv.blocks.{bid}.attention.ln_x", # rwkv6
-            "model.blocks.{bid}.attention.ln_x" # rwkv7
+            "model.layers.{bid}.attention.ln_x" # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_OUTPUT: (
-            "rwkv.blocks.{bid}.attention.output", # rwkv
-            "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
-            "model.blocks.{bid}.attention.output", # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.output.weight", # arwkv7
+            "rwkv.blocks.{bid}.attention.output", # rwkv
+            "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.output", # rwkv7
+            "model.layers.{bid}.attention.o_proj", # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
             "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
-            "model.blocks.{bid}.feed_forward.x_k", # rwkv7
+            "model.layers.{bid}.feed_forward.x_k", # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
@@ -640,7 +626,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CHANNEL_MIX_KEY: (
             "rwkv.blocks.{bid}.feed_forward.key", # rwkv6
-            "model.blocks.{bid}.feed_forward.key", # rwkv7
+            "model.layers.{bid}.feed_forward.key", # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
@@ -649,7 +635,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CHANNEL_MIX_VALUE: (
             "rwkv.blocks.{bid}.feed_forward.value", # rwkv6
-            "model.blocks.{bid}.feed_forward.value", # rwkv7
+            "model.layers.{bid}.feed_forward.value", # rwkv7
         ),
 
         MODEL_TENSOR.ATTN_Q_A: (
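
With the arwkv7-specific patterns dropped, RWKV7 and ARWKV7 checkpoints are expected to resolve through the same model.layers.{bid}.attention.* entries, since the converter now rewrites blocks / self_attn / time_mixer names before looking them up. A rough sketch of such a lookup through gguf-py's TensorNameMap (assuming the repo's gguf-py package with this change is importable; the block count of 2 and the expected output are illustrative):

import gguf

# build the HF-name -> GGUF-name map for a hypothetical 2-block RWKV7 model
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.RWKV7, 2)

# after the converter's renaming pass, the decay LoRA weight arrives under this unified name
print(tmap.get_name("model.layers.0.attention.w1.weight", try_suffixes=(".weight", ".bias")))
# expected to print something like "blk.0.time_mix_w1.weight"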

src/llama-model.cpp

Lines changed: 10 additions & 3 deletions
@@ -3374,9 +3374,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                         layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
 
-                        layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                        layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
-                        layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+                        if (i == 0) {
+                            // actually not used
+                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+                        } else {
+                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+                        }
 
                         layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
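
The loader change encodes a per-layer shape rule for the value-residual tensors: layer 0 keeps its dummy v0/v1/v2 at the ICLR rank (and, per the comment, never uses them), while every other layer sizes v1/v2 with the value-residual-mix rank. A small illustrative sketch of that rule (the helper and the sample sizes 2048/96/64 are hypothetical, not values from this commit):

def time_mix_v_shapes(i: int, n_embd: int, n_lora_iclr: int, n_lora_value_res_mix: int) -> dict:
    # layer 0's v0/v1/v2 are unused dummies kept at the ICLR rank; later layers use the value-residual-mix rank
    rank = n_lora_iclr if i == 0 else n_lora_value_res_mix
    return {
        "time_mix_v0": (n_embd,),
        "time_mix_v1": (n_embd, rank),
        "time_mix_v2": (rank, n_embd),
    }

print(time_mix_v_shapes(0, 2048, 96, 64))  # first layer
print(time_mix_v_shapes(1, 2048, 96, 64))  # every other layer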
