Commit 1734f3f

Author: Lorenzo Toniazzi (committed)
Commit message: Clean up
1 parent 1103bdb commit 1734f3f

File tree: 2 files changed, +25 -109 lines changed

examples/main/main.cpp

Lines changed: 0 additions & 19 deletions
@@ -117,25 +117,6 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
-#include "ggml-metal.h"
-
-bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) {
-    return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size));
-}
-
-
-void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) {
-    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
-    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data != NULL) {
-            if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) {
-                fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name);
-            } else {
-                printf("Tensor %s is correctly allocated in the buffer.\n", t->name);
-            }
-        }
-    }
-}
 
 int main(int argc, char ** argv) {

llama.cpp

Lines changed: 25 additions & 90 deletions
@@ -263,11 +263,27 @@ static struct lora_data * load_lora(struct lora_info * info) {
     result->lora_r = 1;
     result->lora_alpha = 1;
 
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    result->backend = ggml_backend_cuda_init(0); // init device 0
+    if (!result->backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+
+#ifdef GGML_USE_METAL
     fprintf(stderr, "%s: using Metal backend\n", __func__);
     result->backend = ggml_backend_metal_init();
     if (!result->backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!result->backend) {
+        result->backend = ggml_backend_cpu_init();
+    }
+
 
     struct llama_file_lora file(info->filename.c_str(), "rb");
     if (file.fp == NULL) {
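The backend-selection pattern added above (compile-time CUDA or Metal, then a CPU fallback) can be read in isolation as follows. This is a minimal sketch, not code from the commit: the helper name init_backend_with_fallback is made up for illustration, and it guards the Metal branch so it only runs when no backend has been created yet.

#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>

// Illustrative helper: try the compiled-in GPU backend first, then fall back to CPU.
static ggml_backend_t init_backend_with_fallback(void) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
    backend = ggml_backend_cuda_init(0); // device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
#ifdef GGML_USE_METAL
    if (!backend) {
        backend = ggml_backend_metal_init();
    }
#endif
    if (!backend) {
        backend = ggml_backend_cpu_init(); // always available
    }
    return backend;
}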
@@ -320,30 +336,24 @@ static struct lora_data * load_lora(struct lora_info * info) {
         tensors_offset.push_back(offset);
         file.seek(nbytes, SEEK_CUR);
     }
-    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
 
-    // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
+
+
+    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
     if (!result->buffer) {
         LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
     }
     // read tensor data
     result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
     for (size_t i = 0; i < tensors.size(); ++i) {
         struct ggml_tensor * tensor = tensors[i];
         size_t offset = tensors_offset[i];
         size_t nbytes = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
         file.seek(offset, SEEK_SET);
-
         std::vector<char> read_buf;
-        read_buf.resize(ggml_nbytes(tensor));
-        file.read_raw(read_buf.data(), ggml_nbytes(tensor));
-        ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-        // tensor_tmp->data = result->data.data() + data_offset;
-        // file.read_raw(tensor_tmp->data, nbytes);
-        // data_offset += nbytes_pad;
-        // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor));
+        read_buf.resize(nbytes);
+        file.read_raw(read_buf.data(), nbytes);
+        ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes);
     }
     return result;
 }
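The rewritten loading loop relies on ggml_backend_alloc_ctx_tensors() placing every tensor of the context in a single backend-owned buffer, after which ggml_backend_tensor_set() copies the bytes read from the file into that buffer (which may be device memory). Below is a minimal, self-contained sketch of the same allocate-then-upload flow; the tensor shape and zero-filled data are placeholders, and upload_example is an illustrative name, not part of the commit.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <vector>

// Sketch: create a tensor with no host data, allocate it on the backend, then upload bytes.
static void upload_example(ggml_backend_t backend) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8, // metadata only, no tensor data
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    // Allocate all tensors of the context in one buffer owned by the backend.
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // Read (or compute) the raw bytes on the host, then copy them into the backend buffer.
    std::vector<char> read_buf(ggml_nbytes(t), 0);
    ggml_backend_tensor_set(t, read_buf.data(), 0, ggml_nbytes(t));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}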
@@ -16344,7 +16354,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model);
 
-    /// LORA
+    /// LORA load start
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
     lora.filename = params.hot_lora;
@@ -16365,27 +16375,6 @@ struct llama_context * llama_new_context_with_model(
     }
     // Assign data
     ctx->llora_data = *loras[0];
-
-
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
-    if (!buf) {
-        LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
-    }
-    // Looks this worked, need to check if tensors have new buffer (not sure below).
-    // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the
-    // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc
-
-    // TODO looks like I have already a context with load_lora, understand if
-    // I am using it
-    // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
-    // As I should already have created the tensors in the context,
-    // Understand where are the weights loaded instead
-    // Load the weight/data in the context
-    // Maybe check finetuning approach at managing the lora weights.
-
-
-
-    // build the map? TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does
     ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
     // std::vector<std::string> keys;
     // for (const auto& pair : ctx->lora_weights_map) {
@@ -16398,63 +16387,9 @@ struct llama_context * llama_new_context_with_model(
     // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne);
 
     // }
-
-    // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) {
-    //     const auto * name = ggml_get_name(cur);
-    //     // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA));
-    //     // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB));
+    }
 
-    // }
-
-    // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-    //     const auto * weight = get_weight(ggml_get_name(cur));
-    //     if (weight == nullptr) {
-    //         // this can happen with split experts models
-    //         continue;
-    //     }
-
-    //     if (progress_callback) {
-    //         if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-    //             return false;
-    //         }
-    //     }
-
-    //     size_t n_size = ggml_nbytes(cur);
-
-    //     if (use_mmap) {
-    //         const auto & mapping = mappings.at(weight->idx);
-    //         ggml_backend_buffer_t buf_mmap = nullptr;
-    //         if (bufs_mmap.count(weight->idx)) {
-    //             buf_mmap = bufs_mmap.at(weight->idx);
-    //         }
-    //         uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
-
-    //         if (check_tensors) {
-    //             validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-    //                 return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
-    //             }));
-    //         }
-    //         // TODO LORA allocation of base tensors
-    //         GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
-    //         if (buf_mmap && cur->data == nullptr) {
-    //             ggml_backend_tensor_alloc(buf_mmap, cur, data);
-    //             if (lmlocks) {
-    //                 const auto & lmlock = lmlocks->at(weight->idx);
-    //                 lmlock->grow_to(weight->offs + n_size);
-    //             }
-
-    //             auto & mmap_used = mmaps_used[weight->idx];
-    //             mmap_used.first = std::min(mmap_used.first, weight->offs);
-    //             mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
-    //         } else {
-    //             ggml_backend_tensor_set(cur, data, 0, n_size);
-
-
-
-
-    }
-
-    /// LORA
+    /// LORA load end
 
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
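get_lora_weights_map_cpp() is not part of this diff, but it is called on the LoRA context right after loading to populate ctx->lora_weights_map. One plausible way to walk a ggml_context and key its tensors by name is sketched below; this is only an illustration, and the real helper may return a different structure.

#include "ggml.h"
#include <map>
#include <string>

// Sketch: collect every tensor held in a ggml context, keyed by its name.
static std::map<std::string, struct ggml_tensor *> collect_tensors_by_name(struct ggml_context * ctx) {
    std::map<std::string, struct ggml_tensor *> out;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        out[ggml_get_name(t)] = t; // key is the name assigned when the LoRA file was loaded
    }
    return out;
}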
