@@ -7745,7 +7745,18 @@ struct llm_build_context {
7745
7745
ggml_view_3d (ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1 , n_seqs, x_norm_ffn->nb [1 ], x_norm_ffn->nb [2 ], 0 ),
7746
7746
1
7747
7747
);
7748
- cur = ggml_add (ctx0, cur, llm_build_rwkv6_channel_mix (lctx, ctx0, layer, x_norm_ffn, x_prev));
7748
+
7749
+ struct ggml_tensor * inp_ffn = x_norm_ffn;
7750
+
7751
+ if (il == n_layer - 1 ) {
7752
+ // skip computing output for unused tokens
7753
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7754
+ inp_ffn = ggml_get_rows (ctx0, x_norm_ffn, inp_out_ids);
7755
+ x_prev = ggml_get_rows (ctx0, x_prev, inp_out_ids);
7756
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7757
+ }
7758
+
7759
+ cur = ggml_add (ctx0, cur, llm_build_rwkv6_channel_mix (lctx, ctx0, layer, inp_ffn, x_prev));
7749
7760
ggml_build_forward_expand (gf, cur);
7750
7761
7751
7762
struct ggml_tensor * last_norm_att = ggml_view_3d (ctx0, x_norm_att, n_embd, 1 , n_seqs, x_norm_att->nb [1 ], x_norm_att->nb [2 ], (n_seq_tokens-1 )*n_embd*ggml_element_size (x_norm_att));
@@ -7774,9 +7785,8 @@ struct llm_build_context {
7774
7785
}
7775
7786
7776
7787
cur = inpL;
7777
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7778
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
7779
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7788
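+ // NOTE(review): output-row selection via build_inp_out_ids() + ggml_get_rows() was removed from here;
7789
+ // it presumably now happens in the `il == n_layer - 1` branch before channel mix -- confirm it is the same builder.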
7780
7790
7781
7791
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM, cb, -1 );
7782
7792
cb (cur, " result_norm" , -1 );
@@ -7863,6 +7873,13 @@ struct llm_build_context {
7863
7873
7864
7874
cb (ffn_inp, " ffn_inp" , il);
7865
7875
7876
+ if (il == n_layer - 1 ) {
7877
+ // skip computing output for unused tokens
7878
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7879
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7880
+ ffn_inp = ggml_get_rows (ctx0, ffn_inp, inp_out_ids);
7881
+ }
7882
+
7866
7883
// feed-forward network
7867
7884
cur = llm_build_norm (ctx0, ffn_inp, hparams,
7868
7885
model.layers [il].ffn_norm , NULL ,
@@ -7886,10 +7903,6 @@ struct llm_build_context {
7886
7903
}
7887
7904
7888
7905
cur = inpL;
7889
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7890
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
7891
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7892
-
7893
7906
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM_RMS, cb, -1 );
7894
7907
cb (cur, " result_norm" , -1 );
7895
7908
@@ -7970,7 +7983,18 @@ struct llm_build_context {
7970
7983
ggml_view_3d (ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1 , n_seqs, x_norm_ffn->nb [1 ], x_norm_ffn->nb [2 ], 0 ),
7971
7984
1
7972
7985
);
7973
- cur = ggml_add (ctx0, cur, llm_build_rwkv7_channel_mix (lctx, ctx0, layer, x_norm_ffn, x_prev));
7986
+
7987
+ struct ggml_tensor * inp_ffn = x_norm_ffn;
7988
+
7989
+ if (il == n_layer - 1 ) {
7990
+ // skip computing output for unused tokens
7991
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7992
+ inp_ffn = ggml_get_rows (ctx0, x_norm_ffn, inp_out_ids);
7993
+ x_prev = ggml_get_rows (ctx0, x_prev, inp_out_ids);
7994
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7995
+ }
7996
+
7997
+ cur = ggml_add (ctx0, cur, llm_build_rwkv7_channel_mix (lctx, ctx0, layer, inp_ffn, x_prev));
7974
7998
ggml_build_forward_expand (gf, cur);
7975
7999
7976
8000
struct ggml_tensor * last_norm_att = ggml_view_3d (ctx0, x_norm_att, n_embd, 1 , n_seqs, x_norm_att->nb [1 ], x_norm_att->nb [2 ], (n_seq_tokens-1 )*n_embd*ggml_element_size (x_norm_att));
@@ -7999,10 +8023,6 @@ struct llm_build_context {
7999
8023
}
8000
8024
8001
8025
cur = inpL;
8002
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8003
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
8004
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8005
-
8006
8026
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM, cb, -1 );
8007
8027
cb (cur, " result_norm" , -1 );
8008
8028
@@ -8084,6 +8104,13 @@ struct llm_build_context {
8084
8104
8085
8105
cb (ffn_inp, " ffn_inp" , il);
8086
8106
8107
+ if (il == n_layer - 1 ) {
8108
+ // skip computing output for unused tokens
8109
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8110
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8111
+ ffn_inp = ggml_get_rows (ctx0, ffn_inp, inp_out_ids);
8112
+ }
8113
+
8087
8114
// feed-forward network
8088
8115
cur = llm_build_norm (ctx0, ffn_inp, hparams,
8089
8116
model.layers [il].ffn_norm , NULL ,
@@ -8107,10 +8134,6 @@ struct llm_build_context {
8107
8134
}
8108
8135
8109
8136
cur = inpL;
8110
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8111
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
8112
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8113
-
8114
8137
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM_RMS, cb, -1 );
8115
8138
cb (cur, " result_norm" , -1 );
8116
8139
0 commit comments