
Commit 80ea089

llama : allow pooled embeddings on any model (#7477)
* create append_pooling operation; allow to specify attention_type; add last token pooling; update examples
* find result_norm/result_embd tensors properly; update output allocation logic
* only use embd output for pooling_type NONE
* get rid of old causal_attn accessor
* take out attention_type; add in llama_set_embeddings
* bypass logits when doing non-NONE pooling
1 parent 0e64591 commit 80ea089

File tree: 6 files changed, +130 / -70 lines changed
common/common.cpp (2 additions, 0 deletions)
```diff
@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "backend" });
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+
     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }
```
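With this, embedding-capable examples accept `--pooling last` on the command line. For reference, the parsed value reaches the context through the usual common.cpp plumbing; a brief sketch, assuming the existing `llama_context_params_from_gpt_params` helper (not part of this diff):

```cpp
// Sketch only: how "--pooling last" ends up on the context parameters.
gpt_params params;
params.pooling_type = LLAMA_POOLING_TYPE_LAST;  // set by the new "last" branch above

llama_context_params cparams = llama_context_params_from_gpt_params(params);
// cparams.pooling_type is now LLAMA_POOLING_TYPE_LAST, which is what
// llama_pooling_type(ctx) reports after llama_new_context_with_model().
```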

examples/embedding/embedding.cpp (11 additions, 10 deletions)
```diff
@@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
 
         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
```
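Since sequence-level pooling is now computed inside the graph, `batch_add_seq` requests an output for every token (`logits = true` for all positions) and the pooled result is read back per sequence. A hedged sketch of the resulting call pattern (`embed_two` and the token vectors are hypothetical; it mirrors the example's `batch_decode`):

```cpp
// Sketch: pooled embeddings for two sequences decoded in one batch.
static void embed_two(llama_context * ctx, const llama_model * model,
                      const std::vector<int32_t> & tokens_a,
                      const std::vector<int32_t> & tokens_b) {
    llama_batch batch = llama_batch_init((int32_t)(tokens_a.size() + tokens_b.size()), 0, 2);

    batch_add_seq(batch, tokens_a, 0);  // every token marks an output now
    batch_add_seq(batch, tokens_b, 1);

    llama_decode(ctx, batch);

    const int     n_embd = llama_n_embd(model);
    const float * embd_a = llama_get_embeddings_seq(ctx, 0);  // n_embd floats, pooled
    const float * embd_b = llama_get_embeddings_seq(ctx, 1);
    (void) n_embd; (void) embd_a; (void) embd_b;

    llama_batch_free(batch);
}
```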

examples/gritlm/gritlm.cpp (4 additions, 2 deletions)
```diff
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
     // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);
 
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
+
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {
 
     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);
 
     // ### Embedding/Representation ###
```
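GritLM uses one context for both representation and generation, so instead of baking `cparams.embeddings = true` into the context, the example now flips the mode at runtime with the new `llama_set_embeddings`. The pattern, in brief (a sketch of the calls shown in the diff; `ctx` is assumed to exist):

```cpp
// Embedding pass: bidirectional attention, embedding outputs.
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// ... llama_decode(...) the documents and read their embeddings ...

// Generation pass: causal attention, logits again.
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
// ... llama_decode(...) the prompt and sample tokens ...
```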

examples/retrieval/retrieval.cpp (10 additions, 3 deletions)
```diff
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
```
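The retrieval example then compares the pooled query embedding against each pooled chunk embedding; cosine similarity is the usual metric for this. A generic reference sketch (not code from this diff; `cosine_sim` is a hypothetical helper):

```cpp
#include <cmath>

// Cosine similarity between two pooled embedding vectors of length n_embd.
static float cosine_sim(const float * a, const float * b, int n_embd) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (int i = 0; i < n_embd; ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    const float denom = std::sqrt(na) * std::sqrt(nb);
    return denom > 0.0f ? dot / denom : 0.0f;
}
```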

llama.cpp (98 additions, 54 deletions)
```diff
@@ -7649,6 +7649,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }
 
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
```
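For mean pooling, `inp_mean` is an averaging matrix filled in by `llama_set_inputs`: its nonzero entries are `1/n` weights grouping tokens that belong to the same sequence, so the matrix product reduces the token embeddings to one mean vector per sequence. A plain C++ reference version of the same reduction (illustrative only, not ggml code; `mean_pool` and its arguments are hypothetical):

```cpp
#include <vector>

// Reference mean pooling: average the rows of `embd` (n_tokens x n_embd)
// that belong to one sequence; equivalent in spirit to the ggml_mul_mat
// with inp_mean used in append_pooling above.
static std::vector<float> mean_pool(const std::vector<std::vector<float>> & embd,
                                    const std::vector<int> & token_rows) {
    const size_t n_embd = embd.empty() ? 0 : embd[0].size();
    std::vector<float> out(n_embd, 0.0f);
    if (token_rows.empty()) {
        return out;
    }
    for (int row : token_rows) {
        for (size_t j = 0; j < n_embd; ++j) {
            out[j] += embd[row][j];
        }
    }
    for (size_t j = 0; j < n_embd; ++j) {
        out[j] /= (float) token_rows.size();  // the 1/n weighting encoded in inp_mean
    }
    return out;
}
```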

```diff
@@ -8629,8 +8673,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls = build_inp_cls();
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8805,28 +8847,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);
 
-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -11911,6 +11931,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();
 
     return result;
@@ -12000,7 +12025,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         // (!a || b) is a logical implication (a -> b)
         // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention with embedding models is not supported"
+        "causal attention is not supported by this model"
     );
 
     if (lctx.inp_KQ_mask) {
@@ -12132,6 +12157,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;
 
```
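For LAST pooling, `inp_cls` is reused: for every sequence the batch row of its highest-position token is recorded, so the `ggml_get_rows` in `append_pooling` selects the last token of each sequence. A standalone sketch of the same selection (the helper `last_token_rows` and the sample batch are hypothetical):

```cpp
#include <cstdint>
#include <vector>

// For each sequence id, find the batch row holding its highest position
// (mirrors the LAST-pooling input setup above; plain C++ for illustration).
static std::vector<uint32_t> last_token_rows(const std::vector<int> & seq_id,
                                             const std::vector<int> & pos,
                                             int n_seqs) {
    std::vector<int>      last_pos(n_seqs, -1);
    std::vector<uint32_t> last_row(n_seqs, 0);
    for (size_t i = 0; i < seq_id.size(); ++i) {
        if (pos[i] >= last_pos[seq_id[i]]) {
            last_pos[seq_id[i]] = pos[i];
            last_row[seq_id[i]] = (uint32_t) i;
        }
    }
    return last_row;
}

// e.g. seq_id = {0,0,0,1,1}, pos = {0,1,2,0,1}  ->  last rows {2, 4}
```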

```diff
@@ -12193,8 +12249,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -12324,11 +12380,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;
 
     // count outputs
-    if (batch_all.logits) {
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+    } else if (lctx.logits_all) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -12459,30 +12517,13 @@ static int llama_decode_internal(
             // no output
            res  = nullptr;
            embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-            // the embeddings could be in the second to last tensor, or any of the previous tensors
-            int i_embd = gf->n_nodes - 2;
-            for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-                i_embd = gf->n_nodes - i;
-                if (i_embd < 0) { break; }
-                embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res  = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12551,11 +12592,10 @@ static int llama_decode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                         }
                     } break;
-                case LLAMA_POOLING_TYPE_CLS:
                 case LLAMA_POOLING_TYPE_MEAN:
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_LAST:
                     {
-                        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                         // extract sequence embeddings
                         auto & embd_seq_out = lctx.embd_seq;
                         embd_seq_out.clear();
@@ -18112,6 +18152,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
```

llama.h (5 additions, 1 deletion)
```diff
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
```
