@@ -211,8 +211,9 @@ struct lora_merge_ctx {
             }
         }
 
-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -221,22 +222,22 @@ struct lora_merge_ctx {
                 t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
             }
             auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
             if (!t_a && !t_b) {
                 // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
             }
-            gguf_add_tensor(ctx_out, out_tensor);
         }
 
         // placeholder for the meta data
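Note on the `ggml_new_tensor` change above: `ggml_dup_tensor` derives the new tensor's byte strides (`nb`) from the source tensor's type, so the old pattern of duplicating and then overwriting `type` left the strides inconsistent with the actual element layout. A minimal sketch of the difference, assuming the standard ggml API (`ctx` and `base` are placeholders):

```cpp
// Old pattern: nb[] was computed for base->type and is now stale.
struct ggml_tensor * t = ggml_dup_tensor(ctx, base);
t->type = GGML_TYPE_Q8_0; // strides no longer match the tensor's type

// New pattern: ggml_new_tensor derives nb[] from the requested type.
struct ggml_tensor * u = ggml_new_tensor(ctx, GGML_TYPE_Q8_0, GGML_MAX_DIMS, base->ne);
```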
@@ -247,9 +248,9 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
                 n_merged++;
             } else {
                 copy_tensor(it.first);
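The dispatch above relies on a contract set up during construction: `second == nullptr` means `first` is already a tensor of the output context and is copied verbatim, while a non-null `second` means `first` is the original base tensor and `second` is the pre-typed output it merges into. A sketch of how that invariant could be made explicit (the assert is hypothetical, not part of the diff):

```cpp
for (auto & it : base_to_out_tensors) {
    if (it.second != nullptr) {
        // merge path: shapes match, types may differ
        GGML_ASSERT(ggml_are_same_shape(it.first, it.second));
        merge_tensor(it.first, it.second);
    } else {
        copy_tensor(it.first); // copy path: raw bytes, no conversion
    }
}
```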
@@ -265,7 +266,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -276,7 +277,7 @@ struct lora_merge_ctx {
         zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    void merge_tensor(struct ggml_tensor * base) {
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
         std::string name_base(base->name);
         std::string name_lora_a = name_base + ".lora_a";
         std::string name_lora_b = name_base + ".lora_b";
@@ -287,14 +288,14 @@ struct lora_merge_ctx {
         std::vector<struct ggml_tensor *> inp_a(adapters.size());
         std::vector<struct ggml_tensor *> inp_b(adapters.size());
         struct ggml_init_params params {
-            /* .mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /* .mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
             /* .mem_buffer =*/ NULL,
             /* .no_alloc   =*/ true,
         };
         struct ggml_context * ctx = ggml_init(params);
 
         // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
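The `mem_size` bump reflects how metadata-only ggml contexts are budgeted: with `.no_alloc = true` the context stores only tensor headers, one `ggml_tensor_overhead()` each, and the data is placed later via `ggml_backend_alloc_ctx_tensors`. This function creates `inp_base` plus one `lora_a`/`lora_b` pair per adapter, so `2 + adapters.size()*2` appears to leave one header of slack. A generic sketch of the pattern (`n_tensors` is a placeholder for however many tensors you plan to create):

```cpp
// Metadata-only context: headers live in this arena, data in a backend buffer.
struct ggml_init_params params = {
    /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors, // one overhead per tensor header
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true, // do not allocate tensor data in this context
};
struct ggml_context * ctx = ggml_init(params);
```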
@@ -303,9 +304,21 @@ struct lora_merge_ctx {
         }
         ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
 
-        // load data to backend buffer
+        // load base tensor to backend buffer
         base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
         for (size_t i = 0; i < adapters.size(); ++i) {
             adapters[i]->read_tensor_data(name_lora_a, read_buf);
             ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
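The dequantization path above uses ggml's type traits to expand a quantized base tensor to F32 on the host before uploading it, since the merge graph now runs in F32. A self-contained sketch of the same mechanism, assuming the `ggml_internal_get_type_traits` API the hunk uses (the helper name is hypothetical):

```cpp
#include "ggml.h"
#include <cstring>
#include <vector>

// Expand a (possibly quantized) weight buffer to F32 on the host.
static std::vector<float> dequantize_to_f32(const void * data, ggml_type type, int64_t nels) {
    std::vector<float> out(nels);
    if (type == GGML_TYPE_F32) {
        std::memcpy(out.data(), data, nels * sizeof(float)); // already F32, plain copy
    } else {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        traits.to_float(data, out.data(), nels); // block-wise dequantization
    }
    return out;
}
```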
@@ -325,20 +338,21 @@ struct lora_merge_ctx {
         };
         struct ggml_context * ctx0 = ggml_init(params0);
         gf = ggml_new_graph(ctx0);
-        struct ggml_tensor * cur = inp;
+        struct ggml_tensor * cur = inp_base;
         for (size_t i = 0; i < adapters.size(); ++i) {
-            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
             // scale
             const float alpha = adapters[i]->alpha;
             const float rank  = (float) inp_b[i]->ne[0];
             const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
             delta = ggml_scale(ctx0, delta, scale);
-            cur = ggml_add(ctx0, cur, delta);
-            printf("%s : + merging from adapter[%ld]\n", __func__, i);
+            cur = ggml_add(ctx0, delta, cur);
+            printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
             printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
         }
-        cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+        cur = ggml_cast(ctx0, cur, out->type);
+        printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
         ggml_build_forward_expand(gf, cur);
         ggml_free(ctx0);
     }
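For reference, the graph built here computes the standard LoRA merge, `W_out = cast(W_base + sum_i s_i * dW_i, out->type)`, where `dW_i` is the low-rank product of the adapter's `lora_b` and `lora_a` tensors and the per-adapter scale is `s_i = scale_i * alpha_i / rank_i` when alpha is set. A worked example of that scaling rule (the numbers are hypothetical):

```cpp
// scale = user_scale * alpha / rank when alpha != 0, else user_scale.
const float user_scale = 1.0f;  // adapters[i]->scale (from the command line)
const float alpha      = 16.0f; // adapters[i]->alpha (adapter metadata)
const float rank       = 8.0f;  // (float) inp_b[i]->ne[0]
const float scale      = alpha ? user_scale * alpha / rank : user_scale; // -> 2.0f
```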