Commit cf1c144

fix: fix usage of ALIBI

Joan Martinez committed · 1 parent b00d38b

3 files changed (+9 additions, -16 deletions): convert-hf-to-gguf.py, ggml.c, llama.cpp

convert-hf-to-gguf.py

Lines changed: 0 additions & 5 deletions
@@ -2175,17 +2175,12 @@ def __init__(self, *args, **kwargs):
         self.intermediate_size = self.hparams["intermediate_size"]
 
     def get_tensors(self):
-        import string
-        print(f'Intermediate SIZE: {self.intermediate_size}')
-
         for name, data in super().get_tensors():
             if 'gated_layers' in name:
-                print(f'name {name} => {data.shape}')
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
-                print(f'd1 {d1.shape}, d2 {d2.shape}')
                 yield name1, d1
                 yield name2, d2
                 continue
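For context, the logic kept in get_tensors() splits the fused gated_layers weight along its first dimension at intermediate_size and emits the two halves under the gated_layers_w and gated_layers_v names. A minimal standalone sketch of the same split, with numpy standing in for the checkpoint data and a hypothetical tensor name and sizes:

import numpy as np

intermediate_size = 4   # hypothetical; the converter reads hparams["intermediate_size"]
hidden_size = 3         # hypothetical

def split_gated_layers(name, data, intermediate_size):
    # Mirrors the converter: the first intermediate_size rows are emitted as
    # the '_w' tensor, the remaining rows as the '_v' tensor.
    yield name.replace('gated_layers', 'gated_layers_w'), data[:intermediate_size, :]
    yield name.replace('gated_layers', 'gated_layers_v'), data[intermediate_size:, :]

fused = np.zeros((2 * intermediate_size, hidden_size), dtype=np.float32)
for new_name, part in split_gated_layers('layer.0.gated_layers.weight', fused, intermediate_size):
    print(new_name, part.shape)   # each half is (4, 3)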

ggml.c

Lines changed: 6 additions & 8 deletions
@@ -5406,10 +5406,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
-    if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
-    }
-
     bool is_node = false;
 
     if (a->grad) {
@@ -12241,11 +12237,11 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    float * pos = src2 ? (float *) src2->data : NULL;
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
 
         // broadcast the mask across rows
         float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
@@ -12262,7 +12258,7 @@ static void ggml_compute_forward_soft_max_f32(
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
             for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
+                wp[i] = wp[i] - slope*abs(i1%nc - i);
             }
         }
 
@@ -12478,7 +12474,7 @@ static void ggml_compute_forward_alibi_f32(
             for (int64_t j = 0; j < ne1; j++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
+                pdst[0] = -1.0f * i * m_k;
             }
         }
     }
@@ -16111,6 +16107,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }
 
+    fprintf(stdout, "Computing forward (%s) for tensor %s\n", GGML_OP_NAME[tensor->op], tensor->name);
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -16447,6 +16444,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
                 GGML_ASSERT(false);
             } break;
     }
+    fprintf(stdout, "After FORWARD %s (%p): Shape:%li, %li, %li, %li tensor: %9.6f, %9.6f, %9.6f, %9.6f \n", tensor->name, tensor, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ((float *)(tensor->data))[0], ((float *)(tensor->data))[1], ((float *)(tensor->data))[2], ((float *)(tensor->data))[3]);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
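The soft_max change above removes the reliance on a precomputed KQ_pos tensor: instead of adding slope*pos[i], the bias is now formed in place as -slope*abs(i1%nc - i), a penalty that grows with the distance between the current row and each column. A rough Python sketch of that per-row ALiBi bias; m0, m1 and n_head_log2 do not appear in this hunk, so the formulas below are assumptions taken from the surrounding ggml soft_max code:

import math

def alibi_slope(h, n_head, max_bias=8.0):
    # Assumed derivation (not part of this diff): n_head_log2 is the largest
    # power of two <= n_head, m0 = 2^(-max_bias/n_head_log2),
    # m1 = 2^(-(max_bias/2)/n_head_log2).
    n_head_log2 = 2 ** math.floor(math.log2(n_head))
    m0 = 2.0 ** (-max_bias / n_head_log2)
    m1 = 2.0 ** (-(max_bias / 2.0) / n_head_log2)
    return m0 ** (h + 1) if h < n_head_log2 else m1 ** (2 * (h - n_head_log2) + 1)

def alibi_bias_row(row, nc, h, n_head):
    # The bias term subtracted from wp[i] in the new loop body,
    # slope*abs(i1%nc - i), with i1 % nc read as the query position.
    slope = alibi_slope(h, n_head)
    return [-slope * abs(row - i) for i in range(nc)]

print(alibi_bias_row(row=3, nc=5, h=0, n_head=8))
# [-1.5, -1.0, -0.5, -0.0, -0.5]   (first of 8 heads, slope 0.5)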

llama.cpp

Lines changed: 3 additions & 3 deletions
@@ -3795,6 +3795,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
 
                 switch (hparams.n_layer) {
                     case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
@@ -4001,7 +4002,7 @@ static void llm_load_hparams(
 
     model.ftype = ml.ftype;
 
-    if (hparams.f_max_alibi_bias > 0.0f) {
+    if (hparams.f_max_alibi_bias > 0.0f && model.arch != LLM_ARCH_JINA_BERT) {
         hparams.need_kq_pos = true;
     }
 
@@ -4519,7 +4520,6 @@ static bool llm_load_tensors(
     model.layers.resize(n_layer);
 
     const auto tn = LLM_TN(model.arch);
-    //std::printf("JOAN HERE ARCH %i", model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
@@ -7525,7 +7525,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = nullptr;
 
         if (model.arch != LLM_ARCH_JINA_BERT) {
-            inp_pos = build_inp_pos();
+            inp_pos = build_inp_pos();
         }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls = build_inp_cls();
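For the Jina BERT arch the loader now pins f_max_alibi_bias to 8.0f and leaves need_kq_pos unset, so no KQ_pos tensor is required and the bias comes from the in-softmax computation in ggml.c above. As a quick worked example of what max_bias = 8.0 means for the per-head slopes (same assumed schedule as the sketch above, for a head count that is a power of two):

max_bias = 8.0   # value set for the Jina BERT arch in llm_load_hparams
n_head = 8       # illustrative head count
n_head_log2 = 8  # largest power of two <= n_head (assumption, as above)
m0 = 2.0 ** (-max_bias / n_head_log2)
slopes = [m0 ** (h + 1) for h in range(n_head)]
print(slopes)
# [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]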
