@@ -7775,7 +7775,6 @@ struct llm_build_context {
7775
7775
7776
7776
cur = inpL;
7777
7777
struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7778
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
7779
7778
cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7780
7779
7781
7780
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM, cb, -1 );
@@ -7863,6 +7862,13 @@ struct llm_build_context {
7863
7862
7864
7863
cb (ffn_inp, " ffn_inp" , il);
7865
7864
7865
+ if (il == n_layer - 1 ) {
7866
+ // skip computing output for unused tokens
7867
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7868
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7869
+ ffn_inp = ggml_get_rows (ctx0, ffn_inp, inp_out_ids);
7870
+ }
7871
+
7866
7872
// feed-forward network
7867
7873
cur = llm_build_norm (ctx0, ffn_inp, hparams,
7868
7874
model.layers [il].ffn_norm , NULL ,
@@ -7886,10 +7892,6 @@ struct llm_build_context {
7886
7892
}
7887
7893
7888
7894
cur = inpL;
7889
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
7890
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
7891
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
7892
-
7893
7895
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM_RMS, cb, -1 );
7894
7896
cb (cur, " result_norm" , -1 );
7895
7897
@@ -8000,7 +8002,6 @@ struct llm_build_context {
8000
8002
8001
8003
cur = inpL;
8002
8004
struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8003
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
8004
8005
cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8005
8006
8006
8007
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM, cb, -1 );
@@ -8084,6 +8085,13 @@ struct llm_build_context {
8084
8085
8085
8086
cb (ffn_inp, " ffn_inp" , il);
8086
8087
8088
+ if (il == n_layer - 1 ) {
8089
+ // skip computing output for unused tokens
8090
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8091
+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8092
+ ffn_inp = ggml_get_rows (ctx0, ffn_inp, inp_out_ids);
8093
+ }
8094
+
8087
8095
// feed-forward network
8088
8096
cur = llm_build_norm (ctx0, ffn_inp, hparams,
8089
8097
model.layers [il].ffn_norm , NULL ,
@@ -8107,10 +8115,6 @@ struct llm_build_context {
8107
8115
}
8108
8116
8109
8117
cur = inpL;
8110
- struct ggml_tensor * inp_out_ids = build_inp_out_ids ();
8111
- cur = ggml_reshape_2d (ctx0, cur, n_embd, n_tokens);
8112
- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
8113
-
8114
8118
cur = llm_build_norm (ctx0, cur, hparams, model.output_norm , model.output_norm_b , LLM_NORM_RMS, cb, -1 );
8115
8119
cb (cur, " result_norm" , -1 );
8116
8120
0 commit comments