@@ -700,28 +700,31 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    break;
+#ifdef GGML_USE_CUBLAS
+                case GGML_BACKEND_CUDA:
+                    ggml_cuda_load_data(lt.data, lt.ggml_tensor);
+                    break;
+#endif
+                default:
+                    continue;
+            }
             done_size += lt.size;
             if (use_mmap && lmlock) {
                 lmlock->grow_to(done_size);
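This hunk folds GPU placement into the single loading loop: instead of handling only GGML_BACKEND_CPU tensors here and deferring CUDA tensors to a second pass, each tensor's bytes are read once via load_data_for() and then dispatched on lt.ggml_tensor->backend. Below is a minimal sketch of that dispatch shape; the types and upload_to_gpu() are hypothetical stand-ins, not the real ggml definitions:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the ggml types involved; only the
// dispatch shape of the patch is reproduced here.
enum backend_t { BACKEND_CPU, BACKEND_CUDA, BACKEND_OTHER };

struct tensor_t {
    backend_t backend;
    void *    data;  // where compute will read this tensor from
    size_t    size;  // bytes
};

// Stands in for ggml_cuda_load_data(): the real call copies the host
// bytes to a device buffer and points the tensor at it.
static void upload_to_gpu(const uint8_t * src, tensor_t & t) {
    (void) src;
    (void) t;
}

// After reading a tensor's bytes into buf (from mmap or a file read),
// route them by backend -- the same switch the patch introduces.
// Returns the bytes to add to the progress counter.
static size_t place_tensor(tensor_t & t, uint8_t * buf) {
    switch (t.backend) {
        case BACKEND_CPU:
            t.data = buf;           // compute reads the host buffer directly
            return t.size;
        case BACKEND_CUDA:
            upload_to_gpu(buf, t);  // host buffer is free to reuse afterwards
            return t.size;
        default:
            return 0;               // unknown backend: skip, count nothing
    }
}

int main() {
    std::vector<uint8_t> buf(16);
    tensor_t t = { BACKEND_CPU, nullptr, buf.size() };
    std::printf("placed %zu bytes\n", place_tensor(t, buf.data()));
}
```

The `default: continue;` in the patch plays the same role as the zero return here: tensors on an unrecognized backend are skipped and never added to done_size, so the progress fraction only counts bytes that were actually placed.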
@@ -1106,28 +1109,7 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CUBLAS
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
+#ifdef GGML_USE_CLBLAST
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
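The deleted CUBLAS block was the second pass that the switch above makes redundant: it re-walked ml->tensors_map, seeded done_size with the bytes of the already-loaded CPU tensors so the reported fraction stayed monotonic across the two passes, and read each CUDA tensor straight from the file via its name and the first shard's file offset. With CUDA tensors now handled inside load_all_data, only the CLBlast branch survives, so the #elif collapses to a plain #ifdef. Below is a small self-contained illustration of the single 0..1 progress ramp the unified loop produces; the tensor sizes are made up, and print_progress is a hypothetical callback matching llama.h's llama_progress_callback signature:

```cpp
#include <cstddef>
#include <cstdio>

// Same shape as llama.h's llama_progress_callback typedef.
typedef void (*progress_cb)(float progress, void * ctx);

static void print_progress(float progress, void * /*ctx*/) {
    std::fprintf(stderr, "\rloading... %3.0f%%", progress * 100.0f);
}

int main() {
    const size_t sizes[] = { 400, 250, 350 };  // made-up tensor sizes in bytes
    size_t data_size = 0;
    for (size_t s : sizes) {
        data_size += s;
    }

    // One pass over all tensors, CPU and GPU alike, reporting before
    // each load -- the cadence of the unified loop above.
    progress_cb cb = print_progress;
    size_t done_size = 0;
    for (size_t s : sizes) {
        cb((float) done_size / data_size, nullptr);
        done_size += s;  // the tensor would be loaded or uploaded here
    }
    cb(1.0f, nullptr);
    std::fprintf(stderr, "\n");
}
```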