@@ -263,11 +263,27 @@ static struct lora_data * load_lora(struct lora_info * info) {
     result->lora_r = 1;
     result->lora_alpha = 1;
 
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    result->backend = ggml_backend_cuda_init(0); // init device 0
+    if (!result->backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+
+#ifdef GGML_USE_METAL
     fprintf(stderr, "%s: using Metal backend\n", __func__);
     result->backend = ggml_backend_metal_init();
     if (!result->backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#endif
+
+    // if there is no GPU backend, fall back to the CPU backend
+    if (!result->backend) {
+        result->backend = ggml_backend_cpu_init();
+    }
+
 
     struct llama_file_lora file(info->filename.c_str(), "rb");
     if (file.fp == NULL) {
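
The hunk above amounts to compile-time backend selection with an unconditional CPU fallback: try a GPU backend if one was compiled in, otherwise (or on init failure) use the CPU backend. A minimal standalone sketch of the same pattern follows; init_any_backend is a hypothetical helper name, and the extra guard on the Metal branch is an assumption (the patch itself simply assigns whichever GPU backend is compiled in).

#include <stdio.h>
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// sketch only: pick a GPU backend when available, otherwise fall back to CPU
static ggml_backend_t init_any_backend(void) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
    backend = ggml_backend_cuda_init(0); // device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
#ifdef GGML_USE_METAL
    if (!backend) { // assumption: keep an already-initialized CUDA backend
        backend = ggml_backend_metal_init();
    }
#endif
    if (!backend) {
        backend = ggml_backend_cpu_init(); // the CPU backend is always available
    }
    return backend;
}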
@@ -320,30 +336,24 @@ static struct lora_data * load_lora(struct lora_info * info) {
         tensors_offset.push_back(offset);
         file.seek(nbytes, SEEK_CUR);
     }
-    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
 
-    // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
+
+
+    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
     if (!result->buffer) {
         LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
     }
     // read tensor data
     result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
     for (size_t i = 0; i < tensors.size(); ++i) {
         struct ggml_tensor * tensor = tensors[i];
         size_t offset = tensors_offset[i];
         size_t nbytes = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
         file.seek(offset, SEEK_SET);
-
         std::vector<char> read_buf;
-        read_buf.resize(ggml_nbytes(tensor));
-        file.read_raw(read_buf.data(), ggml_nbytes(tensor));
-        ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-        // tensor_tmp->data = result->data.data() + data_offset;
-        // file.read_raw(tensor_tmp->data, nbytes);
-        // data_offset += nbytes_pad;
-        // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor));
+        read_buf.resize(nbytes);
+        file.read_raw(read_buf.data(), nbytes);
+        ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes);
     }
     return result;
 }
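
The loading loop above follows a two-step flow: first ggml_backend_alloc_ctx_tensors() places every tensor of the context in one buffer on the chosen backend, then each tensor's bytes are read into a host staging vector and copied over with ggml_backend_tensor_set(). A self-contained sketch of that flow is below; upload_one_tensor is a hypothetical name, the no_alloc flag is an assumption about how the loader's context is created, and file I/O is elided.

#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch only: allocate ctx tensors in a backend buffer, then upload host bytes
static void upload_one_tensor(ggml_backend_t backend) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8, // metadata only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                       // data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    // one buffer on the chosen backend for every tensor created in ctx
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // stage the bytes on the host (zeros here; the loader reads them from file),
    // then copy them into the tensor's backend storage
    std::vector<char> read_buf(ggml_nbytes(t), 0);
    ggml_backend_tensor_set(t, read_buf.data(), 0, ggml_nbytes(t));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}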
@@ -16344,7 +16354,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model);
 
-    /// LORA
+    /// LORA load start
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
     lora.filename = params.hot_lora;
@@ -16365,27 +16375,6 @@ struct llama_context * llama_new_context_with_model(
     }
     // Assign data
     ctx->llora_data = *loras[0];
-
-
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
-    if (!buf) {
-        LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
-    }
-    // Looks this worked, need to check if tensors have new buffer (not sure below).
-    // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the
-    // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc
-
-    // TODO looks like I have already a context with load_lora, understand if
-    // I am using it
-    // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
-    // As I should already have created the tensors in the context,
-    // Understand where are the weights loaded instead
-    // Load the weight/data in the context
-    // Maybe check finetuning approach at managing the lora weights.
-
-
-
-    // build the map? TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does
     ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
     // std::vector<std::string> keys;
     // for (const auto& pair : ctx->lora_weights_map) {
@@ -16398,63 +16387,9 @@ struct llama_context * llama_new_context_with_model(
     // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne);
 
     // }
-
-    // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) {
-    //     const auto * name = ggml_get_name(cur);
-    //     // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA));
-    //     // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB));
+    }
 
-    // }
-
-    // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-    //     const auto * weight = get_weight(ggml_get_name(cur));
-    //     if (weight == nullptr) {
-    //         // this can happen with split experts models
-    //         continue;
-    //     }
-
-    //     if (progress_callback) {
-    //         if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-    //             return false;
-    //         }
-    //     }
-
-    //     size_t n_size = ggml_nbytes(cur);
-
-    //     if (use_mmap) {
-    //         const auto & mapping = mappings.at(weight->idx);
-    //         ggml_backend_buffer_t buf_mmap = nullptr;
-    //         if (bufs_mmap.count(weight->idx)) {
-    //             buf_mmap = bufs_mmap.at(weight->idx);
-    //         }
-    //         uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
-
-    //         if (check_tensors) {
-    //             validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-    //                 return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
-    //             }));
-    //         }
-    //         // TODO LORA allocation of base tensors
-    //         GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
-    //         if (buf_mmap && cur->data == nullptr) {
-    //             ggml_backend_tensor_alloc(buf_mmap, cur, data);
-    //             if (lmlocks) {
-    //                 const auto & lmlock = lmlocks->at(weight->idx);
-    //                 lmlock->grow_to(weight->offs + n_size);
-    //             }
-
-    //             auto & mmap_used = mmaps_used[weight->idx];
-    //             mmap_used.first = std::min(mmap_used.first, weight->offs);
-    //             mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
-    //         } else {
-    //             ggml_backend_tensor_set(cur, data, 0, n_size);
-
-
-
-
-    }
-
-    /// LORA
+    /// LORA load end
 
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
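
For the map built from the LoRA context (ctx->lora_weights_map above), the diff does not show get_lora_weights_map_cpp itself; one plausible shape for such a name-to-tensor map, sketched here purely as an assumption, is a walk over the context's tensors keyed by ggml_get_name().

#include <map>
#include <string>
#include "ggml.h"

// sketch only: build a name -> tensor map from a ggml context
static std::map<std::string, struct ggml_tensor *> map_tensors_by_name(struct ggml_context * ctx) {
    std::map<std::string, struct ggml_tensor *> m;
    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL;
         cur = ggml_get_next_tensor(ctx, cur)) {
        m[ggml_get_name(cur)] = cur; // keyed by the name stored in the LoRA file
    }
    return m;
}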