@@ -7649,6 +7649,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }
 
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
 
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
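
Note on the MEAN branch above: build_inp_mean() provides a per-batch weight matrix (filled in llama_set_inputs() with 1/sequence_length at each token's slot), so the transpose-plus-matmul reduces to a per-sequence average of token embeddings. A minimal standalone sketch of that same reduction, with illustrative shapes rather than the actual ggml layout:

#include <cstddef>
#include <vector>

// embd: n_tokens rows of n_embd floats; seq[i]: sequence id of row i.
// Returns one mean-pooled row per sequence -- the reduction that the
// inp_mean matrix multiply in append_pooling() encodes.
static std::vector<std::vector<float>> mean_pool(
        const std::vector<std::vector<float>> & embd,
        const std::vector<int> & seq, int n_seqs) {
    const size_t n_embd = embd.empty() ? 0 : embd[0].size();
    std::vector<std::vector<float>> out(n_seqs, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seqs, 0);
    for (size_t i = 0; i < embd.size(); ++i) {
        for (size_t d = 0; d < n_embd; ++d) {
            out[seq[i]][d] += embd[i][d];
        }
        count[seq[i]]++;
    }
    for (int s = 0; s < n_seqs; ++s) {
        for (size_t d = 0; d < n_embd && count[s] > 0; ++d) {
            out[s][d] /= count[s]; // weight 1/len, matching the inp_mean weights
        }
    }
    return out;
}
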
@@ -8629,8 +8673,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls  = build_inp_cls();
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8805,28 +8847,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);
 
-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -11911,6 +11931,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();
 
     return result;
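
The effect of this hook, sketched below: every build_*() graph ends in a result_norm/result_embd tensor, and pooling is grafted on afterwards, so MEAN/CLS/LAST pooling works for any architecture instead of only inside the BERT builder (hence the removals in the two hunks above):

// build_llama() --+
// build_bert()  --+--> result_norm / result_embd --> append_pooling() --> result_embd_pooled
// build_*()     --+        (only when lctx.cparams.embeddings is set)
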
@@ -12000,7 +12025,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         // (!a || b) is a logical implication (a -> b)
         // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention with embedding models is not supported"
+        "causal attention is not supported by this model"
     );
 
     if (lctx.inp_KQ_mask) {
@@ -12132,6 +12157,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;
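
A worked example of the scan above, with illustrative values: for a five-row batch covering two sequences, the rows holding each sequence's largest position win:

// rows:   i = 0  1  2  3  4
// seq_id:     0  0  0  1  1
// pos:        0  1  2  0  1
//
// after the scan: last_row = {2, 4, -1, -1, -1}
// so inp_cls->data = {2, 4, 0, 0, 0}, and ggml_get_rows() in
// append_pooling() extracts the final token of each sequence.
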
@@ -12193,8 +12249,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
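
The simplification above makes logits and embeddings mutually exclusive per context. For reference, the resulting buffer allocation per mode (a summary of the two flags, not code from the patch):

// cparams.embeddings  pooling_type     logits buffer  embd buffer
// false               (any)            yes            no
// true                NONE             no             yes (one row per token)
// true                MEAN/CLS/LAST    no             no  (pooled rows go to lctx.embd_seq)
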
@@ -12324,11 +12380,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;
 
     // count outputs
-    if (batch_all.logits) {
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+    } else if (lctx.logits_all) {
        n_outputs = n_tokens_all;
     } else {
         // keep last output only
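
With the reordering above, the output count is resolved in this precedence (sketch):

// 1. embeddings + pooling != NONE -> n_outputs = n_tokens_all (pool over every row)
// 2. batch_all.logits provided    -> count only the rows the caller flagged
// 3. lctx.logits_all              -> n_outputs = n_tokens_all
// 4. otherwise                    -> keep the last row only
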
@@ -12459,30 +12517,13 @@ static int llama_decode_internal(
         // no output
         res  = nullptr;
         embd = nullptr;
-    } else if (!hparams.causal_attn) {
-        res = nullptr; // do not extract logits for embedding models such as BERT
-
-        // token or sequence embeddings
-        embd = gf->nodes[gf->n_nodes - 1];
-
-        GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
     } else if (cparams.embeddings) {
-        // the embeddings could be in the second to last tensor, or any of the previous tensors
-        int i_embd = gf->n_nodes - 2;
-        for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-            i_embd = gf->n_nodes - i;
-            if (i_embd < 0) { break; }
-            embd = gf->nodes[i_embd];
-        }
-        GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-        // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-        if (!cparams.causal_attn) {
-            res = nullptr; // do not extract logits when not needed
-            // skip computing logits
-            // TODO: is this safe?
-            gf->n_nodes = i_embd + 1;
+        res  = nullptr; // do not extract logits for embedding case
+        embd = gf->nodes[gf->n_nodes - 1];
+        if (strcmp(embd->name, "result_embd_pooled") != 0) {
+            embd = gf->nodes[gf->n_nodes - 2];
         }
+        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
     } else {
         embd = nullptr; // do not extract embeddings when not needed
         GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
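
Since append_pooling() always names its output result_embd_pooled and expands it last, the open-ended backward scan for result_norm is no longer needed; the pooled tensor sits at the graph tail or, presumably when one trailing op follows it, the node before:

// gf->nodes[n_nodes - 1] -> "result_embd_pooled"   (typical case)
// gf->nodes[n_nodes - 2] -> "result_embd_pooled"   (one trailing node after it)
// anything else          -> the GGML_ASSERT fires: pooling layer was not appended
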
@@ -12551,11 +12592,10 @@ static int llama_decode_internal(
                         ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                     }
                 } break;
-            case LLAMA_POOLING_TYPE_CLS:
             case LLAMA_POOLING_TYPE_MEAN:
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
                 {
-                    GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                     // extract sequence embeddings
                     auto & embd_seq_out = lctx.embd_seq;
                     embd_seq_out.clear();
@@ -18112,6 +18152,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
}
 
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
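
A minimal usage sketch for the new setter: toggling one context between generation and embedding extraction at runtime. Batch construction and error handling are omitted, and an initialized ctx and batch are assumed; llama_get_embeddings_seq() is the existing accessor for pooled embeddings:

llama_set_embeddings(ctx, true);     // decode now produces embeddings, no logits

if (llama_decode(ctx, batch) == 0) {
    // pooled embedding for sequence 0 (pooling_type != NONE)
    const float * embd = llama_get_embeddings_seq(ctx, 0);
    // ... consume embd[0 .. n_embd-1] ...
}

llama_set_embeddings(ctx, false);    // back to logits for text generation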