@@ -8254,6 +8254,9 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
 
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos(false);
+
         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * cur = inpL;
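For context, the `build_inp_*` helpers allocate graph inputs that are filled in before evaluation rather than computing values themselves. A minimal sketch of what `build_inp_KQ_pos(false)` plausibly does, assuming the surrounding `llm_build_context` members (`ctx0`, `lctx`, `n_kv`, `n_tokens`, `cb`); the helper's body is outside this diff, so treat the details as an assumption:

```cpp
// Sketch, not the verbatim helper: a 1D F32 input tensor holding one
// position per entry; with causal == false it covers only the current
// batch of n_tokens instead of the whole KV cache.
struct ggml_tensor * build_inp_KQ_pos(bool causal) {
    lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32,
            causal ? n_kv : n_tokens);
    cb(lctx.inp_KQ_pos, "KQ_pos", -1);
    ggml_set_input(lctx.inp_KQ_pos); // values are written in before eval
    return lctx.inp_KQ_pos;
}
```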
@@ -8322,7 +8325,7 @@ struct llm_build_context {
                 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                 cb(kq, "kq", il);
 
-                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, KQ_pos, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
                 cb(kq, "kq_soft_max_ext", il);
 
                 struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
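Passing `KQ_pos` instead of `nullptr` asks the fused `ggml_soft_max_ext` to add an ALiBi bias of `slope(head) * pos[j]` to each scaled logit before the softmax; since softmax is invariant to a per-row constant, this is equivalent to the relative `slope * (pos[j] - pos[i])` bias from the ALiBi paper. A sketch of the per-head slope as ggml derives it from `hparams.f_max_alibi_bias` (the helper name here is illustrative, not an actual ggml symbol):

```cpp
#include <math.h>

// Illustrative helper: per-head ALiBi slope as the fused soft_max
// kernel derives it when max_bias > 0.0f.
static float alibi_slope(int head, int n_head, float max_bias) {
    const int   n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias        / n_head_log2);
    const float m1 = powf(2.0f, -max_bias / 2.0f / n_head_log2);
    return head < n_head_log2
        ? powf(m0, head + 1)                    // first 2^k heads
        : powf(m1, 2*(head - n_head_log2) + 1); // interpolated tail
}
```

With `max_bias == 8.0f` and a power-of-two `n_head`, this collapses to the classic `2^(-8(h+1)/n_head)` slopes from the ALiBi paper.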
@@ -11523,7 +11526,7 @@ static int llama_decode_internal(
         }
 
         // non-causal masks do not use the KV cache
-        if (hparams.causal_attn) {
+        if (hparams.causal_attn || model.arch == LLM_ARCH_JINA_BERT_V2) {
             llama_kv_cache_update(&lctx);
 
             // if we have enough unused cells before the current head ->
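This decode-side change lets JINA_BERT_V2, which is non-causal but needs per-token positions for its ALiBi bias, still run the KV-cache bookkeeping that the position input depends on. A hedged sketch of how the `KQ_pos` input could then be populated at eval time; the actual fill lives elsewhere in llama.cpp (outside this diff), and the use of `batch.pos` is an assumption for the non-causal BERT path:

```cpp
// Hedged sketch of filling the KQ_pos input before graph evaluation:
// one float per batch token, holding that token's absolute position.
if (lctx.inp_KQ_pos) {
    float * data = (float *) lctx.inp_KQ_pos->data;
    for (int i = 0; i < n_tokens; ++i) {
        data[i] = (float) batch.pos[i]; // position of batch token i
    }
}
```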