@@ -468,11 +468,13 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
# LLAMA_POOLING_TYPE_NONE = 0,
# LLAMA_POOLING_TYPE_MEAN = 1,
# LLAMA_POOLING_TYPE_CLS = 2,
+ # LLAMA_POOLING_TYPE_LAST = 3,
# };
LLAMA_POOLING_TYPE_UNSPECIFIED = -1
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2
+ LLAMA_POOLING_TYPE_LAST = 3
# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -761,7 +763,6 @@ class llama_model_params(ctypes.Structure):
# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
- # // (ignored if no pooling layer)
# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
# float rope_freq_base; // RoPE base frequency, 0 = from model
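For orientation (not part of this diff): a minimal sketch showing how the new LLAMA_POOLING_TYPE_LAST constant can be selected through the low-level llama_context_params struct, whose commented pooling_type field is quoted above. The embeddings field is assumed to exist on the same struct; no model is loaded here.

import llama_cpp

# Default low-level context parameters (pooling_type starts as UNSPECIFIED).
params = llama_cpp.llama_context_default_params()
params.embeddings = True  # assumed field: request embedding output from decode
params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_LAST  # pool on the last token per sequence

print(params.pooling_type)  # 3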
@@ -2316,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
...
+ # // Set whether the model is in embeddings mode or not
+ # // If true, embeddings will be returned but logits will not
+ # LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+ @ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+ def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+     """Set whether the model is in embeddings mode or not
+     If true, embeddings will be returned but logits will not"""
+     ...
+
+
# // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
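A hedged usage sketch of the new binding (the model path is illustrative and the decode/readback steps are elided): llama_set_embeddings lets a single live context flip between returning logits and returning embeddings without being rebuilt.

import llama_cpp

llama_cpp.llama_backend_init()
model = llama_cpp.llama_load_model_from_file(
    b"/path/to/model.gguf",  # illustrative path, not from this diff
    llama_cpp.llama_model_default_params(),
)
ctx = llama_cpp.llama_new_context_with_model(model, llama_cpp.llama_context_default_params())

llama_cpp.llama_set_embeddings(ctx, True)  # decode now fills embeddings, not logits
# ... llama_decode(ctx, batch), then read llama_cpp.llama_get_embeddings(ctx) ...
llama_cpp.llama_set_embeddings(ctx, False)  # restore logits for text generation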