@@ -850,9 +850,9 @@ struct LLM_TN {
 //
 
 static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
+    { LLAMA_ROPE_SCALING_NONE,   "none"   },
+    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
 };
 
 static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
         }
     }
 
-    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }
 
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
@@ -1581,7 +1581,7 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
-    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
@@ -3007,7 +3007,7 @@ static void llm_load_hparams(
     std::string rope_scaling("linear");
     ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
@@ -3655,7 +3655,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
-    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
+    if (split_mode == LLAMA_SPLIT_LAYER) {
         // calculate the split points
         int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3694,10 +3694,10 @@ static bool llm_load_tensors(
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+        if (split_mode == LLAMA_SPLIT_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
         } else {
-            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
+            // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(main_gpu);
         }
         // assign the repeating layers
@@ -5028,7 +5028,7 @@ struct llm_build_context {
         n_kv             (worst_case ? n_ctx            : kv_self.n),
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
-        pooling_type     (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_NONE),
         rope_type        (hparams.rope_type),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
@@ -6011,12 +6011,12 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+        if (pooling_type == LLAMA_POOLING_MEAN) {
             cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
             cur = ggml_get_rows(ctx0, cur, inp_cls);
         } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
 
@@ -7684,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7712,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -11286,7 +11286,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.split_mode                  =*/ LLAMA_SPLIT_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
@@ -11312,7 +11312,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                     =*/ 512,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
@@ -11500,16 +11500,16 @@ struct llama_context * llama_new_context_with_model(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
     }
 
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11543,8 +11543,8 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_CUBLAS)
         if (model->n_gpu_layers > 0) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
                 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11553,7 +11553,7 @@ struct llama_context * llama_new_context_with_model(
                 }
                 ctx->backends.push_back(backend);
             } else {
-                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
                 for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                     ggml_backend_t backend = ggml_backend_cuda_init(device);
                     if (backend == nullptr) {
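
Note: the hunks above only touch usages of the renamed identifiers; the header change itself is not shown. Below is a minimal standalone sketch of how the renamed rope-scaling constants and the string lookup from the first two hunks fit together. The explicit numeric enum values and the main() driver are assumptions added for illustration, not taken from this diff.

#include <cstdint>
#include <map>
#include <string>

// Sketch of the renamed constants; the numeric values are assumed, not confirmed by the diff.
enum llama_rope_scaling_type : int32_t {
    LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
    LLAMA_ROPE_SCALING_NONE        = 0,
    LLAMA_ROPE_SCALING_LINEAR      = 1,
    LLAMA_ROPE_SCALING_YARN        = 2,
};

static const std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_NONE,   "none"   },
    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
};

// Same lookup shape as the function in the diff: match the GGUF string,
// fall back to UNSPECIFIED so callers can substitute the model's trained value.
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return LLAMA_ROPE_SCALING_UNSPECIFIED;
}

int main() {
    // "yarn" resolves to LLAMA_ROPE_SCALING_YARN; unknown strings fall back to UNSPECIFIED.
    const int32_t a = llama_rope_scaling_type_from_string("yarn");
    const int32_t b = llama_rope_scaling_type_from_string("not-a-type");
    return (a == LLAMA_ROPE_SCALING_YARN && b == LLAMA_ROPE_SCALING_UNSPECIFIED) ? 0 : 1;
}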