@@ -47,7 +47,6 @@
 #include <algorithm>
 #include <initializer_list>
 #include <thread>
-#include <atomic>
 #include <mutex>
 #include <sstream>
 #include <numeric>
@@ -92,6 +91,53 @@ static const size_t MB = 1024*1024;
 
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
+#ifdef GGML_USE_CUBLAS
+#define llama_host_malloc(n)    ggml_cuda_host_malloc(n)
+#define llama_host_free(data)   ggml_cuda_host_free(data)
+#elif GGML_USE_METAL
+#define llama_host_malloc(n)    ggml_metal_host_malloc(n)
+#define llama_host_free(data)   ggml_metal_host_free(data)
+#else
+#define llama_host_malloc(n)    malloc(n)
+#define llama_host_free(data)   free(data)
+#endif
+
+struct llama_buffer {
+    void * data = NULL;
+    size_t size = 0;
+
+    // fallback to malloc / free
+    // useful in cases where CUDA can try to allocate PINNED memory
+    bool fallback = false;
+
+    void resize(size_t n) {
+        llama_host_free(data);
+
+        data = llama_host_malloc(n);
+        if (!data) {
+            fallback = true;
+            data = malloc(n);
+        } else {
+            fallback = false;
+        }
+
+        GGML_ASSERT(data);
+        size = n;
+    }
+
+    ~llama_buffer() {
+        if (data) {
+            if (fallback) { // NOLINT
+                free(data);
+            } else {
+                llama_host_free(data);
+            }
+        }
+
+        data = NULL;
+    }
+};
+
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
@@ -254,7 +300,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -305,7 +351,7 @@ struct llama_model {
     struct ggml_context * ctx = NULL;
 
     // the model memory buffer
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<gguf_mmap> mapping;
@@ -394,15 +440,15 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    gguf_ctx_buffer buf_compute;
+    llama_buffer buf_compute;
 
 #ifdef LLAMA_USE_ALLOCATOR
-    gguf_ctx_buffer buf_alloc;
+    llama_buffer buf_alloc;
     ggml_allocr * alloc = NULL;
 #endif
 
 #ifdef LLAMA_USE_SCRATCH
-    gguf_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -416,15 +462,15 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    void use_buf(struct ggml_context * ctx, int i) {
+    static void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
 
         if (i == -1) {
             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.data, });
         }
 
         if (buf_last >= 0) {
@@ -438,7 +484,7 @@ struct llama_context {
 #endif
     }
 
-    size_t get_buf_max_mem(int i) const {
+    static size_t get_buf_max_mem(int i) {
 #if defined(LLAMA_USE_SCRATCH)
         return buf_max_size[i];
 #else
@@ -1024,7 +1070,7 @@ static bool kv_cache_init(
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
-    params.mem_buffer = cache.buf.addr;
+    params.mem_buffer = cache.buf.data;
     params.no_alloc   = false;
 
     cache.ctx = ggml_init(params);
@@ -1275,13 +1321,13 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init(model.buf.data);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
             /*.mem_size   =*/ model.buf.size,
-            /*.mem_buffer =*/ model.buf.addr,
+            /*.mem_buffer =*/ model.buf.data,
             /*.no_alloc   =*/ ml->use_mmap,
         };
 
@@ -1565,7 +1611,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.addr,
+        /*.mem_buffer =*/ buf_compute.data,
         /*.no_alloc   =*/ false,
     };
 
@@ -3012,11 +3058,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
@@ -3134,10 +3180,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3205,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3262,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 #endif
 
+        const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
         float * f32_data;
-        size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-        gguf_buffer f32_conv_buf;
+        std::vector<float> f32_conv_buf;
 
         if (tensor.type == GGML_TYPE_F32) {
             f32_data = (float *) tensor.data;
         } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
         } else {
             llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-            f32_data = (float *) f32_conv_buf.addr;
+            f32_data = (float *) f32_conv_buf.data();
         }
 
         LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         work.resize(nelements * 4); // upper bound on size
-        new_data = work.addr;
+        new_data = work.data();
         std::vector<int64_t> hist_cur(1 << 4, 0);
 
-        int chunk_size = 32 * 512;
+        const int chunk_size = 32 * 512;
         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
         const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
         if (nthread_use < 2) {
             new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
         } else {
             size_t counter = 0;
             new_size = 0;
-            auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() {
+            auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
                 std::vector<int64_t> local_hist;
                 size_t local_size = 0;
                 while (true) {
@@ -3315,8 +3364,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3438,7 +3485,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_allocr_free(ctx->alloc);
 
         ctx->buf_alloc.resize(alloc_size);
-        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
     }
 #else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3479,11 +3526,11 @@ struct llama_context * llama_new_context_with_model(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].data, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].data, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -3565,7 +3612,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3629,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3643,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size   = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size   = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc   = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
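
Note (not part of the diff): a minimal usage sketch of the new llama_buffer introduced in the @@ -92,6 +91,53 @@ hunk above. It assumes that struct and the llama_host_malloc/llama_host_free macros are in scope (GGML_ASSERT comes from ggml.h), and the demo function name is hypothetical. It only illustrates the fallback contract: when the pinned host allocation fails, data comes from plain malloc and the destructor must release it with free() rather than llama_host_free().

// usage sketch, assuming the llama_buffer definition from the hunk above is in scope
#include <cstdio>

static void llama_buffer_demo() {
    llama_buffer buf;

    buf.resize(8 * 1024 * 1024);   // tries llama_host_malloc first (pinned memory on CUDA/Metal builds)
    printf("allocated %zu bytes, fallback = %d\n", buf.size, (int) buf.fallback);

    buf.resize(16 * 1024 * 1024);  // resize() frees the previous block before allocating the new one

    // on scope exit the destructor frees with free() when fallback is set,
    // otherwise with llama_host_free(), so the deallocator always matches the allocator
}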