@@ -44,6 +44,7 @@ class TensorNameMap:
             "transformer.norm",                        # openelm
             "rwkv.blocks.0.pre_ln",                    # rwkv6
             "model.pre_ln",                            # rwkv7
+            "model.layers.0.pre_norm",                 # rwkv7
             "backbone.norm",                           # wavtokenizer
         ),
@@ -126,15 +127,15 @@ class TensorNameMap:
             "encoder.layers.{bid}.input_layernorm",    # chatglm
             "transformer.layers.{bid}.attn_norm",      # openelm
             "rwkv.blocks.{bid}.ln1",                   # rwkv6
-            "model.blocks.{bid}.ln1",                  # rwkv7
+            "model.layers.{bid}.ln1",                  # rwkv7
         ),

         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",             # falcon40b
             "encoder.layer.{bid}.layer_norm_1",        # jina-v2-code
             "rwkv.blocks.{bid}.ln2",                   # rwkv6
-            "model.blocks.{bid}.ln2",                  # rwkv7
+            "model.layers.{bid}.ln2",                  # rwkv7
         ),

         # Attention query-key-value
@@ -468,77 +469,63 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.TIME_MIX_W0: (
-            "model.blocks.{bid}.attention.w0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w0",  # arwkv7
+            "model.layers.{bid}.attention.w0",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_W1: (
             "rwkv.blocks.{bid}.attention.time_maa_w1",     # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w1",    # rwkv6qwen2
-            "model.blocks.{bid}.attention.w1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w1",  # arwkv7
+            "model.layers.{bid}.attention.w1",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_W2: (
             "rwkv.blocks.{bid}.attention.time_maa_w2",     # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w2",    # rwkv6qwen2
-            "model.blocks.{bid}.attention.w2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w2",  # arwkv7
+            "model.layers.{bid}.attention.w2",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_A0: (
-            "model.blocks.{bid}.attention.a0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a0",  # arwkv7
+            "model.layers.{bid}.attention.a0",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_A1: (
-            "model.blocks.{bid}.attention.a1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a1",  # arwkv7
+            "model.layers.{bid}.attention.a1",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_A2: (
-            "model.blocks.{bid}.attention.a2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a2",  # arwkv7
+            "model.layers.{bid}.attention.a2",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_V0: (
-            "model.blocks.{bid}.attention.v0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v0",  # arwkv7
+            "model.layers.{bid}.attention.v0",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_V1: (
-            "model.blocks.{bid}.attention.v1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v1",  # arwkv7
+            "model.layers.{bid}.attention.v1",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_V2: (
-            "model.blocks.{bid}.attention.v2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v2",  # arwkv7
+            "model.layers.{bid}.attention.v2",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_G1: (
-            "model.blocks.{bid}.attention.g1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g1",  # arwkv7
+            "model.layers.{bid}.attention.g1",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_G2: (
-            "model.blocks.{bid}.attention.g2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g2",  # arwkv7
+            "model.layers.{bid}.attention.g2",             # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_K_K: (
-            "model.blocks.{bid}.attention.k_k",            # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_k", # arwkv7
+            "model.layers.{bid}.attention.k_k",            # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_K_A: (
-            "model.blocks.{bid}.attention.k_a",            # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_a", # arwkv7
+            "model.layers.{bid}.attention.k_a",            # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_R_K: (
-            "model.blocks.{bid}.attention.r_k",            # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.r_k", # arwkv7
+            "model.layers.{bid}.attention.r_k",            # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_LERP_X: (
@@ -591,47 +578,46 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.TIME_MIX_KEY: (
-            "rwkv.blocks.{bid}.attention.key",                     # rwkv6
-            "model.layers.{bid}.self_attn.k_proj",                 # rwkv6qwen2
-            "model.blocks.{bid}.attention.key",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.key.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.key",         # rwkv6
+            "model.layers.{bid}.self_attn.k_proj",     # rwkv6qwen2
+            "model.layers.{bid}.attention.key",        # rwkv7
+            "model.layers.{bid}.attention.k_proj",     # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_VALUE: (
-            "rwkv.blocks.{bid}.attention.value",                     # rwkv6
-            "model.layers.{bid}.self_attn.v_proj",                   # rwkv6qwen2
-            "model.blocks.{bid}.attention.value",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.value.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.value",       # rwkv6
+            "model.layers.{bid}.self_attn.v_proj",     # rwkv6qwen2
+            "model.layers.{bid}.attention.value",      # rwkv7
+            "model.layers.{bid}.attention.v_proj",     # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.attention.receptance",                     # rwkv6
-            "model.layers.{bid}.self_attn.q_proj",                        # rwkv6qwen2
-            "model.blocks.{bid}.attention.receptance",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.receptance.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.receptance",  # rwkv6
+            "model.layers.{bid}.self_attn.q_proj",     # rwkv6qwen2
+            "model.layers.{bid}.attention.receptance", # rwkv7
+            "model.layers.{bid}.attention.r_proj",     # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_GATE: (
             "rwkv.blocks.{bid}.attention.gate",        # rwkv6
             "model.layers.{bid}.self_attn.gate",       # rwkv6qwen2
-            "model.layers.{bid}.self_attn.time_mixer.gate.weight",  # arwkv7
         ),

         MODEL_TENSOR.TIME_MIX_LN: (
             "rwkv.blocks.{bid}.attention.ln_x",        # rwkv6
-            "model.blocks.{bid}.attention.ln_x"        # rwkv7
+            "model.layers.{bid}.attention.ln_x"        # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_OUTPUT: (
-            "rwkv.blocks.{bid}.attention.output",                     # rwkv
-            "model.layers.{bid}.self_attn.o_proj",                    # rwkv6qwen2
-            "model.blocks.{bid}.attention.output",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.output.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.output",      # rwkv
+            "model.layers.{bid}.self_attn.o_proj",     # rwkv6qwen2
+            "model.layers.{bid}.attention.output",     # rwkv7
+            "model.layers.{bid}.attention.o_proj",     # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
             "rwkv.blocks.{bid}.feed_forward.time_maa_k",  # rwkv6
-            "model.blocks.{bid}.feed_forward.x_k",        # rwkv7
+            "model.layers.{bid}.feed_forward.x_k",        # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
@@ -640,7 +626,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.CHANNEL_MIX_KEY: (
             "rwkv.blocks.{bid}.feed_forward.key",      # rwkv6
-            "model.blocks.{bid}.feed_forward.key",     # rwkv7
+            "model.layers.{bid}.feed_forward.key",     # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
@@ -649,7 +635,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.CHANNEL_MIX_VALUE: (
             "rwkv.blocks.{bid}.feed_forward.value",    # rwkv6
-            "model.blocks.{bid}.feed_forward.value",   # rwkv7
+            "model.layers.{bid}.feed_forward.value",   # rwkv7
         ),

         MODEL_TENSOR.ATTN_Q_A: (
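For readers skimming the diff: the sketch below shows how `{bid}` templates like the ones edited above are typically expanded into a flat lookup table, which is why switching the rwkv7 prefix from `model.blocks` to `model.layers` only requires touching these template strings. This is illustrative only, not the actual gguf-py implementation; `mappings_cfg` here is a tiny subset and `build_mapping` is a simplified stand-in for the real `TensorNameMap` constructor.

```python
# Illustrative sketch (NOT the real gguf-py TensorNameMap code): expand
# {bid} templates into a flat checkpoint-name -> canonical-tensor table.

mappings_cfg: dict[str, tuple[str, ...]] = {
    "TIME_MIX_W0": (
        "model.layers.{bid}.attention.w0",      # rwkv7
    ),
    "TIME_MIX_KEY": (
        "rwkv.blocks.{bid}.attention.key",      # rwkv6
        "model.layers.{bid}.attention.key",     # rwkv7
        "model.layers.{bid}.attention.k_proj",  # rwkv7
    ),
}

def build_mapping(n_blocks: int) -> dict[str, tuple[str, int]]:
    """Expand every {bid} template for each block index into a flat
    checkpoint-name -> (canonical tensor id, block index) table."""
    mapping: dict[str, tuple[str, int]] = {}
    for tensor_id, templates in mappings_cfg.items():
        for template in templates:
            for bid in range(n_blocks):
                mapping[template.format(bid=bid)] = (tensor_id, bid)
    return mapping

mapping = build_mapping(n_blocks=24)
# The native rwkv7 name and its projection-style alias resolve identically,
# so either spelling in a checkpoint maps to the same GGUF tensor:
print(mapping["model.layers.3.attention.key"])     # ('TIME_MIX_KEY', 3)
print(mapping["model.layers.3.attention.k_proj"])  # ('TIME_MIX_KEY', 3)
```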