@@ -2236,8 +2236,12 @@ struct llama_model_loader {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
 
+    struct ggml_tensor * get_tensor_meta(const char * name) const {
+        return ggml_get_tensor(ctx_meta, name);
+    }
+
     struct ggml_tensor * get_tensor_meta(int i) const {
-        return ggml_get_tensor(ctx_meta, get_tensor_name(i));
+        return get_tensor_meta(get_tensor_name(i));
     }
 
     struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
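The new name-based `get_tensor_meta` overload lets callers reach a tensor's metadata directly, without first resolving its index; the index-based variant now funnels through it. A minimal usage sketch, assuming an initialized `llama_model_loader ml` (the tensor name is illustrative):

```cpp
// Look up shape/type metadata by name; ggml_get_tensor returns nullptr for unknown names.
struct ggml_tensor * meta = ml.get_tensor_meta("output_norm.weight");
if (meta != nullptr) {
    // Only metadata is valid here - the tensor's data has not been loaded yet.
    size_t nbytes = ggml_nbytes(meta); // how much data a later load would bring in
    (void) nbytes;
}
```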
@@ -2302,7 +2306,7 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }
 
-    void init_mapping() {
+    void init_mapping(bool prefetch = true) {
         /*
         // prefetch only CPU tensors
         if (use_mmap) {
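The new `prefetch` flag maps onto `llama_mmap`'s prefetch-size argument: `-1` means prefetch the whole file (per the comment in the next hunk), while `0`, previously hard-coded at the quantize call site, maps the file without readahead. The two resulting call patterns:

```cpp
ml.init_mapping();      // prefetch = true  -> llama_mmap(&file, -1, ...), whole-file prefetch
ml.init_mapping(false); // prefetch = false -> llama_mmap(&file,  0, ...), map only, no readahead
```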
@@ -2320,17 +2324,19 @@
         */
         // prefetch the whole file - all the data is needed anyway
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file, -1, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
         }
     }
 
-    // for backwards compatibility only
+    // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
         const size_t offs = file_offset(ggml_get_name(cur));
 
-        if (use_mmap) {
+        if (use_mmap && mapping) {
+            GGML_ASSERT(cur->data == nullptr);
             cur->data = (uint8_t *)mapping->addr + offs;
         } else {
+            GGML_ASSERT(cur->data != nullptr);
             file.seek(offs, SEEK_SET);
             file.read_raw(cur->data, ggml_nbytes(cur));
         }
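The added asserts spell out `load_data_for`'s contract: with an active mapping the tensor must not own a buffer yet, since its `data` pointer is simply redirected into the mapping; without mmap the caller must attach a destination buffer before the read. A sketch of the two caller-side patterns (fragment only; the staging buffer is hypothetical and `<vector>` is assumed):

```cpp
// mmap path: t->data starts out nullptr; afterwards it points into the mapping.
ml.load_data_for(t);

// non-mmap path: attach a caller-owned buffer first, then load_data_for()
// seeks to the tensor's file offset and reads ggml_nbytes(t) bytes into it.
std::vector<uint8_t> buf(ggml_nbytes(t));
t->data = buf.data();
ml.load_data_for(t);
```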
@@ -8569,9 +8575,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 #endif
 
     llama_model_loader ml(fname_inp, use_mmap, NULL);
-    if (ml.use_mmap) {
-        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
-    }
+    ml.init_mapping(false); // no prefetching?
 
     llama_model model;
     llm_load_arch(ml, model);
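The quantizer reads each tensor exactly once, so whole-file prefetch buys nothing; `init_mapping(false)` replaces the hand-rolled setup removed above and has the same effect when `use_mmap` is set:

```cpp
// Removed inline setup (old):
if (ml.use_mmap) {
    ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
}

// Replacement (new): same mapping, but the policy lives in the loader.
ml.init_mapping(false);
```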
@@ -8650,8 +8654,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             tensor->data = read_data.data();
         }
-        GGML_ASSERT(!"not implemented");
-        // ml.load_data_for(tensor); TODO
+        ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
                ++idx, ml.n_tensors,
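With `load_data_for` reinstated, the quantize loop handles both I/O modes again: when not mmapped, the code just above grows a reusable staging buffer and attaches it, which satisfies the non-mmap assert. Simplified shape of the loop body (the `!ml.use_mmap` guard is inferred from context, not shown in this hunk):

```cpp
if (!ml.use_mmap) {
    // grow the shared staging buffer and point the tensor at it
    if (read_data.size() < ggml_nbytes(tensor)) {
        read_data.resize(ggml_nbytes(tensor));
    }
    tensor->data = read_data.data();
}
// mmap: redirects tensor->data into the mapping; otherwise: reads into read_data
ml.load_data_for(tensor);
```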
@@ -8871,24 +8874,10 @@ static int llama_apply_lora_from_file_internal(
     // load base model
     std::unique_ptr<llama_model_loader> ml;
 
-    unique_context base_ctx(nullptr, ggml_free);
-    if (path_base_model) {
+    if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
-
-        size_t ctx_size = ggml_tensor_overhead() * ml->n_tensors;
-
-        ggml_init_params base_params;
-        base_params.mem_size   = ctx_size;
-        base_params.mem_buffer = NULL;
-        base_params.no_alloc   = true;
-
-        base_ctx.reset(ggml_init(base_params));
-
-        // maybe this should be in llama_model_loader
-        if (ml->use_mmap) {
-            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
-        }
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml->init_mapping(false); // no prefetching
     }
 
     // read tensors and apply
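Since the loader's own `ctx_meta` already holds a no-alloc tensor for every entry in the file, the LoRA path no longer needs its private `base_ctx` or a manual mmap; base tensors come straight from the loader, backed by the mapping. The resulting lookup-then-load pattern, as the next hunk applies it:

```cpp
ml->init_mapping(false);                            // map the base model, no prefetch
struct ggml_tensor * base_t = ml->get_tensor_meta(base_name.c_str());
ml->load_data_for(base_t);                          // base_t->data now points into the mapping
```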
@@ -9001,9 +8990,8 @@ static int llama_apply_lora_from_file_internal(
                 return 1;
             }
 
-            base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
-            GGML_ASSERT(!"not implemented");
-            // ml->load_data_for(base_t); // TODO
+            base_t = ml->get_tensor_meta(base_name.c_str());
+            ml->load_data_for(base_t);
         } else {
             base_t = dest_t;
         }