@@ -7760,7 +7760,18 @@ struct llm_build_context {
                 ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                 1
             );
-            cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+
+            struct ggml_tensor * inp_ffn = x_norm_ffn;
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                inp_ffn = ggml_get_rows(ctx0, x_norm_ffn, inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, inp_ffn, x_prev));
             ggml_build_forward_expand(gf, cur);

             struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
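Note on the pattern above: build_inp_out_ids() supplies the indices of the tokens whose outputs are actually requested, and ggml_get_rows keeps only those rows, so the last layer's channel mix runs only on the needed tokens. A minimal standalone sketch of the row-pruning step (not part of this commit; the sizes and the single kept index are invented for illustration, using the plain ggml C API):

#include "ggml.h"
#include <stdio.h>

int main(void) {
    // small scratch context; 16 MiB is plenty for this toy graph
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 8, n_tokens = 4;

    // pretend hidden states for 4 tokens
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

    // indices of the tokens whose outputs are needed (here: only the last one)
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ggml_set_i32_1d(out_ids, 0, n_tokens - 1);

    // keep only the selected rows; every op built on top of `pruned`
    // now works on 1 token instead of 4
    struct ggml_tensor * pruned = ggml_get_rows(ctx, cur, out_ids);

    printf("pruned: %lld x %lld\n", (long long) pruned->ne[0], (long long) pruned->ne[1]); // 8 x 1

    ggml_free(ctx);
    return 0;
}

Because the rows are already filtered at the last layer, the graph-final ggml_get_rows becomes redundant, which is why the next hunk comments it out.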
@@ -7789,9 +7800,8 @@ struct llm_build_context {
         }

         cur = inpL;
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+        // struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+        // cur = ggml_get_rows(ctx0, cur, inp_out_ids);

         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
@@ -7874,6 +7884,13 @@ struct llm_build_context {

             cb(ffn_inp, "ffn_inp", il);

+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
             // feed-forward network
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
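Same idea in this builder: cur and ffn_inp are filtered with the same ids because ffn_inp is the residual that is added back to the FFN output further down, so the two tensors must keep identical token counts. A throwaway sketch of that shape constraint (again not from the commit; names and sizes are made up):

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 8, n_tokens = 4;
    struct ggml_tensor * cur     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * ffn_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ggml_set_i32_1d(out_ids, 0, n_tokens - 1);   // keep only the last token

    // prune both operands of the later residual add with the same indices,
    // so the residual stays aligned token-for-token
    cur     = ggml_get_rows(ctx, cur,     out_ids);
    ffn_inp = ggml_get_rows(ctx, ffn_inp, out_ids);

    struct ggml_tensor * out = ggml_add(ctx, cur, ffn_inp);   // 8 x 1, shapes agree
    (void) out;

    ggml_free(ctx);
    return 0;
}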
@@ -7897,10 +7914,6 @@ struct llm_build_context {
         }

         cur = inpL;
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);
@@ -7981,7 +7994,18 @@ struct llm_build_context {
                 ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                 1
             );
-            cur = ggml_add(ctx0, cur, llm_build_rwkv7_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+
+            struct ggml_tensor * inp_ffn = x_norm_ffn;
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                inp_ffn = ggml_get_rows(ctx0, x_norm_ffn, inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, llm_build_rwkv7_channel_mix(lctx, ctx0, layer, inp_ffn, x_prev));
             ggml_build_forward_expand(gf, cur);

             struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
@@ -8010,10 +8034,6 @@ struct llm_build_context {
         }

         cur = inpL;
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
@@ -8095,6 +8115,13 @@ struct llm_build_context {

             cb(ffn_inp, "ffn_inp", il);

+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
             // feed-forward network
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
@@ -8118,10 +8145,6 @@ struct llm_build_context {
         }

         cur = inpL;
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
-        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);