Commit cf1c144

fix: fix usage of ALIBI

Joan Martinez committed · 1 parent b00d38b

3 files changed (+9 additions, -16 deletions): convert-hf-to-gguf.py, ggml.c, llama.cpp

convert-hf-to-gguf.py

Lines changed: 0 additions & 5 deletions
@@ -2175,17 +2175,12 @@ def __init__(self, *args, **kwargs):
         self.intermediate_size = self.hparams["intermediate_size"]
 
     def get_tensors(self):
-        import string
-        print(f'Intermediate SIZE: {self.intermediate_size}')
-
         for name, data in super().get_tensors():
             if 'gated_layers' in name:
-                print(f'name {name} => {data.shape}')
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
-                print(f'd1 {d1.shape}, d2 {d2.shape}')
                 yield name1, d1
                 yield name2, d2
                 continue
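For context, the logic kept in get_tensors() splits the fused gated_layers weight along its first dimension at intermediate_size and emits the two halves under the gated_layers_w and gated_layers_v names. A minimal standalone sketch of the same split, with numpy standing in for the checkpoint data and a hypothetical tensor name and sizes:

import numpy as np

intermediate_size = 4   # hypothetical; the converter reads hparams["intermediate_size"]
hidden_size = 3         # hypothetical

def split_gated_layers(name, data, intermediate_size):
    # Mirrors the converter: the first intermediate_size rows are emitted as
    # the '_w' tensor, the remaining rows as the '_v' tensor.
    yield name.replace('gated_layers', 'gated_layers_w'), data[:intermediate_size, :]
    yield name.replace('gated_layers', 'gated_layers_v'), data[intermediate_size:, :]

fused = np.zeros((2 * intermediate_size, hidden_size), dtype=np.float32)
for new_name, part in split_gated_layers('layer.0.gated_layers.weight', fused, intermediate_size):
    print(new_name, part.shape)   # each half is (4, 3)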

ggml.c

Lines changed: 6 additions & 8 deletions
@@ -5406,10 +5406,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
-    if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
-    }
-
     bool is_node = false;
 
     if (a->grad) {
@@ -12241,11 +12237,11 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    float * pos = src2 ? (float *) src2->data : NULL;
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
 
         // broadcast the mask across rows
         float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
@@ -12262,7 +12258,7 @@ static void ggml_compute_forward_soft_max_f32(
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
             for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
+                wp[i] = wp[i] - slope*abs(i1%nc - i);
             }
         }
 
@@ -12478,7 +12474,7 @@ static void ggml_compute_forward_alibi_f32(
             for (int64_t j = 0; j < ne1; j++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
+                pdst[0] = -1.0f * i * m_k;
             }
         }
     }
@@ -16111,6 +16107,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }
 
+    fprintf(stdout, "Computing forward (%s) for tensor %s\n", GGML_OP_NAME[tensor->op], tensor->name);
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -16447,6 +16444,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
                 GGML_ASSERT(false);
             } break;
     }
+    fprintf(stdout, "After FORWARD %s (%p): Shape:%li, %li, %li, %li tensor: %9.6f, %9.6f, %9.6f, %9.6f \n", tensor->name, tensor, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ((float *)(tensor->data))[0], ((float *)(tensor->data))[1], ((float *)(tensor->data))[2], ((float *)(tensor->data))[3]);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
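The soft_max change above removes the reliance on a precomputed KQ_pos tensor: instead of adding slope*pos[i], the bias is now formed in place as -slope*abs(i1%nc - i), a penalty that grows with the distance between the current row and each column. A rough Python sketch of that per-row ALiBi bias; m0, m1 and n_head_log2 do not appear in this hunk, so the formulas below are assumptions taken from the surrounding ggml soft_max code:

import math

def alibi_slope(h, n_head, max_bias=8.0):
    # Assumed derivation (not part of this diff): n_head_log2 is the largest
    # power of two <= n_head, m0 = 2^(-max_bias/n_head_log2),
    # m1 = 2^(-(max_bias/2)/n_head_log2).
    n_head_log2 = 2 ** math.floor(math.log2(n_head))
    m0 = 2.0 ** (-max_bias / n_head_log2)
    m1 = 2.0 ** (-(max_bias / 2.0) / n_head_log2)
    return m0 ** (h + 1) if h < n_head_log2 else m1 ** (2 * (h - n_head_log2) + 1)

def alibi_bias_row(row, nc, h, n_head):
    # The bias term subtracted from wp[i] in the new loop body,
    # slope*abs(i1%nc - i), with i1 % nc read as the query position.
    slope = alibi_slope(h, n_head)
    return [-slope * abs(row - i) for i in range(nc)]

print(alibi_bias_row(row=3, nc=5, h=0, n_head=8))
# [-1.5, -1.0, -0.5, -0.0, -0.5]   (first of 8 heads, slope 0.5)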

llama.cpp

Lines changed: 3 additions & 3 deletions
@@ -3795,6 +3795,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
 
                 switch (hparams.n_layer) {
                     case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
@@ -4001,7 +4002,7 @@ static void llm_load_hparams(
 
     model.ftype = ml.ftype;
 
-    if (hparams.f_max_alibi_bias > 0.0f) {
+    if (hparams.f_max_alibi_bias > 0.0f && model.arch != LLM_ARCH_JINA_BERT) {
         hparams.need_kq_pos = true;
     }
 
@@ -4519,7 +4520,6 @@ static bool llm_load_tensors(
     model.layers.resize(n_layer);
 
     const auto tn = LLM_TN(model.arch);
-    //std::printf("JOAN HERE ARCH %i", model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
@@ -7525,7 +7525,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = nullptr;
 
         if (model.arch != LLM_ARCH_JINA_BERT) {
-            inp_pos = build_inp_pos();
+            inp_pos = build_inp_pos();
         }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls = build_inp_cls();
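For the Jina BERT arch the loader now pins f_max_alibi_bias to 8.0f and leaves need_kq_pos unset, so no KQ_pos tensor is required and the bias comes from the in-softmax computation in ggml.c above. As a quick worked example of what max_bias = 8.0 means for the per-head slopes (same assumed schedule as the sketch above, for a head count that is a power of two):

max_bias = 8.0   # value set for the Jina BERT arch in llm_load_hparams
n_head = 8       # illustrative head count
n_head_log2 = 8  # largest power of two <= n_head (assumption, as above)
m0 = 2.0 ** (-max_bias / n_head_log2)
slopes = [m0 ** (h + 1) for h in range(n_head)]
print(slopes)
# [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]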
