Commit 41cd47c

examples : export-lora : fix issue with quantized base models (#8687)
1 parent 49ce0ab commit 41cd47c

1 file changed: +39 -25

examples/export-lora/export-lora.cpp

Lines changed: 39 additions & 25 deletions
@@ -211,8 +211,9 @@ struct lora_merge_ctx {
             }
         }
 
-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -221,22 +222,22 @@ struct lora_merge_ctx {
                 t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
             }
             auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
             if (!t_a && !t_b) {
                 // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
             }
-            gguf_add_tensor(ctx_out, out_tensor);
         }
 
         // placeholder for the meta data
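
Note: this hunk is the heart of the fix. The old code duplicated the base tensor and then overwrote its type, but ggml_dup_tensor() computes the byte strides (nb) for the source type, so a later type override leaves size metadata that is wrong whenever base and output types differ, as with quantized base models. A minimal standalone sketch of the difference; this is an illustration written for this note, not part of the commit, and the shapes and types are arbitrary:

// illustrative only: compares the old dup+type-override layout
// with a fresh allocation of the target type
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 8 * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t ne[GGML_MAX_DIMS] = {4096, 4096, 1, 1};
    struct ggml_tensor * base = ggml_new_tensor(ctx, GGML_TYPE_Q4_0, GGML_MAX_DIMS, ne);

    // old approach: the duplicate inherits strides computed for Q4_0,
    // so after the type override ggml_nbytes() reports a Q4_0-sized buffer
    struct ggml_tensor * dup = ggml_dup_tensor(ctx, base);
    dup->type = GGML_TYPE_Q8_0;

    // new approach: strides are derived from the requested type up front
    struct ggml_tensor * fresh = ggml_new_tensor(ctx, GGML_TYPE_Q8_0, GGML_MAX_DIMS, ne);

    printf("dup:   %zu bytes\n", ggml_nbytes(dup));   // stale Q4_0 layout
    printf("fresh: %zu bytes\n", ggml_nbytes(fresh)); // correct Q8_0 layout

    ggml_free(ctx);
    return 0;
}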
@@ -247,9 +248,9 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
                 n_merged++;
             } else {
                 copy_tensor(it.first);
@@ -265,7 +266,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -276,7 +277,7 @@ struct lora_merge_ctx {
         zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    void merge_tensor(struct ggml_tensor * base) {
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
         std::string name_base(base->name);
         std::string name_lora_a = name_base + ".lora_a";
         std::string name_lora_b = name_base + ".lora_b";
@@ -287,14 +288,14 @@ struct lora_merge_ctx {
         std::vector<struct ggml_tensor *> inp_a(adapters.size());
         std::vector<struct ggml_tensor *> inp_b(adapters.size());
         struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc   =*/ true,
         };
         struct ggml_context * ctx = ggml_init(params);
 
         // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
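
Note: the context here is metadata-only. With no_alloc = true, ggml_init() needs just ggml_tensor_overhead() bytes per tensor (the bump from 1+2n to 2+2n tensors' worth of overhead leaves headroom for the fresh F32 inp_base), and the tensor data itself is placed in a backend buffer afterwards via ggml_backend_alloc_ctx_tensors(). A self-contained sketch of that pattern, assuming the CPU backend; the tensor count and sizes are invented for the example:

// illustrative only: metadata-only context, then one backend buffer
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    const int n_tensors = 4;
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // context stores tensor metadata only
    };
    struct ggml_context * ctx = ggml_init(params);

    for (int i = 0; i < n_tensors; ++i) {
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    }

    // place the actual tensor data in one backend buffer
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    printf("allocated %zu bytes for %d tensors\n",
           ggml_backend_buffer_get_size(buf), n_tensors);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}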
@@ -303,9 +304,21 @@ struct lora_merge_ctx {
         }
         ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
 
-        // load data to backend buffer
+        // load base tensor to backend buffer
         base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
         for (size_t i = 0; i < adapters.size(); ++i) {
             adapters[i]->read_tensor_data(name_lora_a, read_buf);
             ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
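
Note: the new branch is what makes quantized base models work. Instead of copying raw (possibly quantized) bytes into an F32 tensor, it dequantizes them on the CPU first: ggml_internal_get_type_traits() returns the per-type conversion table, and its to_float callback expands a quantized buffer into plain floats. A round-trip sketch of that mechanism, assuming Q8_0 and an element count that is a multiple of the block size; the values are made up:

// illustrative only: quantize F32 -> Q8_0, then expand back with to_float(),
// the same callback merge_tensor() uses on the base weights
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t nels = 64; // multiple of the Q8_0 block size (32)
    std::vector<float> src(nels), dst(nels);
    for (int64_t i = 0; i < nels; ++i) src[i] = 0.01f * (float) i;

    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);

    std::vector<uint8_t> quant(ggml_row_size(GGML_TYPE_Q8_0, nels));
    traits.from_float(src.data(), quant.data(), nels);
    traits.to_float(quant.data(), dst.data(), nels);

    printf("src[5]=%.4f dequantized[5]=%.4f\n", src[5], dst[5]);
    return 0;
}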
@@ -325,20 +338,21 @@ struct lora_merge_ctx {
         };
         struct ggml_context * ctx0 = ggml_init(params0);
         gf = ggml_new_graph(ctx0);
-        struct ggml_tensor * cur = inp;
+        struct ggml_tensor * cur = inp_base;
         for (size_t i = 0; i < adapters.size(); ++i) {
-            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
             // scale
             const float alpha = adapters[i]->alpha;
             const float rank = (float) inp_b[i]->ne[0];
             const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
             delta = ggml_scale(ctx0, delta, scale);
-            cur = ggml_add(ctx0, cur, delta);
-            printf("%s : + merging from adapter[%ld]\n", __func__, i);
+            cur = ggml_add(ctx0, delta, cur);
+            printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
             printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
         }
-        cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+        cur = ggml_cast(ctx0, cur, out->type);
+        printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
         ggml_build_forward_expand(gf, cur);
         ggml_free(ctx0);
     }
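
Note: taken together, the rebuilt graph evaluates the whole merge in F32 and casts to the output type exactly once at the end. Writing the ggml_mul_mat(a_T, inp_b[i]) product as A_i^T B_i (up to ggml's row-major layout conventions), the computation is:

W_{\text{out}} = \operatorname{cast}\!\Big( W_{\text{F32}} + \sum_i s_i \, A_i^{\top} B_i,\ \texttt{out->type} \Big),
\qquad
s_i =
\begin{cases}
  \text{scale}_i \cdot \alpha_i / r_i, & \alpha_i \neq 0,\\
  \text{scale}_i, & \text{otherwise,}
\end{cases}

where r_i is the adapter rank read from inp_b[i]->ne[0]. Casting inp_a/inp_b to F32 before the matmul is what lets adapters stored in other types merge into a quantized base without a type mismatch.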
