@@ -2831,7 +2831,7 @@ struct llama_context {
 struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
-    llama_lora_weight() {}
+    llama_lora_weight() = default;
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
 };
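For orientation, this is the pair that the loader below fills in for every adapted base tensor. A minimal sketch (not part of this diff; the helper and names are assumed) of how such an a/b pair is typically folded into a matmul with plain ggml ops, where scale is the adapter scale:

    // Sketch only: apply a frozen base weight w plus a LoRA pair (a, b) to input x.
    // Computes y = w*x + scale * (b * (a * x)).
    static struct ggml_tensor * lora_mul_mat_sketch(
            struct ggml_context * ctx,
            struct ggml_tensor * w,            // base weight from the model
            struct llama_lora_weight & lw,     // one entry of ab_map
            struct ggml_tensor * x,            // layer input
            float scale) {
        struct ggml_tensor * base = ggml_mul_mat(ctx, w, x);
        struct ggml_tensor * ax   = ggml_mul_mat(ctx, lw.a, x);   // down-project to rank r
        struct ggml_tensor * bax  = ggml_mul_mat(ctx, lw.b, ax);  // up-project to n_out
        return ggml_add(ctx, base, ggml_scale(ctx, bax, scale));
    }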
@@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
-    static const int n_inp_tensors = 5; // see llama_model
-    static const int n_out_tensors = 5; // see llama_model
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
+        /* .no_alloc = */ true,
         /* .ctx      = */ &ctx,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
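Loading the adapter metadata with no_alloc = true keeps only tensor descriptors in ctx; the weight data stays on disk and is streamed in later (see the set_tensor lambda near the end of this function). A hedged sketch of that two-pass pattern, assuming a standalone adapter file named adapter.gguf:

    // Sketch only: metadata pass, then manual data reads at the recorded offsets.
    ggml_context * meta_ctx = nullptr;
    struct gguf_init_params params = {
        /* .no_alloc = */ true,   // build tensor descriptors only, no data
        /* .ctx      = */ &meta_ctx,
    };
    struct gguf_context * gctx = gguf_init_from_file("adapter.gguf", params);
    // for each tensor t enumerated from meta_ctx, the data lives at
    //   gguf_get_data_offset(gctx) + gguf_get_tensor_offset(gctx, gguf_find_tensor(gctx, t->name))
    // and is read back in ggml_nbytes(t) chunks before ggml_backend_tensor_set().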
@@ -18536,58 +18534,43 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     // check metadata
     {
         auto get_kv_str = [&](std::string key) -> std::string {
-            std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
             int id = gguf_find_key(ctx_gguf, key.c_str());
             return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
         auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
         auto lora_arch = llm_arch_from_string(lora_arch_name);
         if (lora_arch != model->arch) {
+            gguf_free(ctx_gguf);
             throw std::runtime_error("model arch and LoRA arch mismatch");
         }
+
         auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
         if (train_type != "finetune_lora") {
+            gguf_free(ctx_gguf);
             throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
         }
     }
 
-    // calculate n_tensors_per_layer
-    int n_tensors_per_layer = 0;
-    {
-        int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            int il = -1;
-            sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
-            if (il == 0) n_tensors_per_layer++;
-        }
-    }
-
-    // count layer buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
-    for (int64_t i = 0; i < model->hparams.n_layer; i++) {
-        buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer;
-    }
-    buft_tensor_count[model->buft_input.buft] += n_inp_tensors;
-    buft_tensor_count[model->buft_output.buft] += n_out_tensors;
+    int n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-    // allocate contexts
+    // contexts for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    {
-        auto new_ggml_ctx = [](size_t n_tensors) {
+    auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
             struct ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
-            return ggml_init(params);
+            ggml_context * buft_ctx = ggml_init(params);
+            ctx_map[buft] = buft_ctx;
+            return buft_ctx;
         };
-        for (auto & it : buft_tensor_count) {
-            int n_tensors = it.second;
-            // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
-            ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
-        }
-    }
+        return it->second;
+    };
 
     // bundle lora_a and lora_b into pairs
     std::map<std::string, llama_lora_weight> ab_map;
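ab_map is keyed by the base tensor name with the ".lora_a" / ".lora_b" suffix stripped, so both halves of a pair end up in the same entry. A rough sketch of that bundling step (the suffix helper is assumed here, not shown in this hunk):

    // Sketch only: route "<base>.lora_a" / "<base>.lora_b" into one ab_map slot.
    auto ends_with = [](const std::string & s, const std::string & suffix) {
        return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
    };
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name(cur->name);
        if (ends_with(name, ".lora_a")) {
            ab_map[name.substr(0, name.size() - 7)].a = cur;   // 7 == strlen(".lora_a")
        } else if (ends_with(name, ".lora_b")) {
            ab_map[name.substr(0, name.size() - 7)].b = cur;
        } // anything else is rejected, as the next hunk shows
    }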
@@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                 ab_map[name].b = cur;
             }
         } else {
-            // maybe "optimizer.*"" tensors
-            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
     }
 
     // add tensors
     for (auto & it : ab_map) {
-        std::string name = it.first;
-        const char * cname = name.c_str();
+        const std::string & name = it.first;
         llama_lora_weight & w = it.second;
-        GGML_ASSERT(w.a != nullptr);
-        GGML_ASSERT(w.b != nullptr);
-        int il = -1;
-        sscanf(cname, "blk.%d.", &il);
+
+        if (!w.a || !w.b) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
         // device buft and device ctx
-        auto model_tensor = llama_get_model_tensor(model, cname);
+        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
         if (!model_tensor) {
             gguf_free(ctx_gguf);
             ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
-        struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
+        struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
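The three shape checks tie each pair to its base weight: lora_a must match the base tensor's input dimension (ne[0]), lora_b its output dimension (ne[1]), and the two must agree on the rank (w.a->ne[1] == w.b->ne[0]). A worked example with assumed sizes:

    // Assumed example: base weight ne = {4096, 4096}, adapter rank 16.
    //   w.a->ne = {4096, 16}   // ne[0] matches the base weight's ne[0]
    //   w.b->ne = {16, 4096}   // ne[1] matches the base weight's ne[1]
    //   w.a->ne[1] == w.b->ne[0] == 16, the shared rank
    // A pair stored in the old transposed layout (the "finetune" example) fails
    // the last check, which is what the hint in the error message refers to.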
@@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
             ggml_free(ctx);
             throw std::runtime_error("failed to allocate buffer for lora adapter\n");
         }
-        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         adapter.ctxs.push_back(ctx_dev);
         adapter.bufs.push_back(buf);
     }
@@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
             size_t size = ggml_nbytes(orig);
-            if (read_buf.size() < size) {
-                read_buf.resize(size);
-            }
+            read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
             gguf_file.read_raw(read_buf.data(), size);
-            // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
             ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
         };
         for (auto & it : adapter.ab_map) {
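The loop that continues past this hunk presumably walks adapter.ab_map and uploads both halves of each pair through set_tensor; a hedged guess at its shape, with the variable names assumed:

    // Sketch only (body truncated in this view):
    //   llama_lora_weight & file_w = ab_map[it.first];   // tensors described in ctx (file-backed)
    //   llama_lora_weight & dev_w  = it.second;          // device-side copies made earlier
    //   set_tensor(file_w.a, dev_w.a);
    //   set_tensor(file_w.b, dev_w.b);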