@@ -721,6 +721,11 @@ struct llama_model_loader {
             case GGML_BACKEND_CUDA:
                 ggml_cuda_load_data(lt.data, lt.ggml_tensor);
                 break;
+#endif
+#ifdef GGML_USE_CLBLAST
+            case GGML_BACKEND_CL:
+                ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                break;
 #endif
             default:
                 continue;
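Aside for readers skimming the patch without the surrounding file: the new `GGML_BACKEND_CL` case mirrors the existing CUDA path, so tensors tagged for the OpenCL backend are uploaded while the model file is being read, rather than in a separate pass after loading (the block removed at the end of this diff). A minimal standalone sketch of that dispatch shape, using hypothetical stand-in names (`loader_backend`, `loader_tensor`, `upload_tensor`) rather than the real loader structures in llama.cpp:

```cpp
#include <cstdio>

// Hypothetical stand-ins for ggml's backend enum and loader tensor type.
enum loader_backend { BACKEND_CPU, BACKEND_CUDA, BACKEND_CL };

struct loader_tensor {
    loader_backend backend;
    const char *   name;
};

// Route a freshly read tensor to the device that will own it; backends that
// are not compiled in simply fall through to the default branch.
static void upload_tensor(const loader_tensor & t) {
    switch (t.backend) {
#ifdef GGML_USE_CUBLAS
        case BACKEND_CUDA:
            std::printf("%s -> CUDA (ggml_cuda_load_data in the real code)\n", t.name);
            break;
#endif
#ifdef GGML_USE_CLBLAST
        case BACKEND_CL:
            std::printf("%s -> OpenCL (ggml_cl_transform_tensor in the real code)\n", t.name);
            break;
#endif
        default:
            // CPU tensors need no device upload.
            break;
    }
}
```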
@@ -1006,8 +1011,10 @@ static void llama_model_load_internal(
         }
     }
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS)
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#elif defined(GGML_USE_CLBLAST)
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
 #endif
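The preprocessor chain above decides at build time which backend constant offloaded layers are tagged with: CUDA when built with cuBLAS, OpenCL when built with CLBlast, and plain CPU (effectively no offload) otherwise. A small sketch of the same selection pattern, assuming the usual `GGML_USE_CUBLAS` / `GGML_USE_CLBLAST` build defines; `report_offload_target()` is a hypothetical helper used only for illustration:

```cpp
#include <cstdio>

// Exactly one branch survives preprocessing, depending on the build flags.
#if defined(GGML_USE_CUBLAS)
#define OFFLOAD_TARGET "GGML_BACKEND_CUDA"
#elif defined(GGML_USE_CLBLAST)
#define OFFLOAD_TARGET "GGML_BACKEND_CL"
#else
#define OFFLOAD_TARGET "GGML_BACKEND_CPU"
#endif

// Hypothetical helper: in the real code the chosen constant is passed as the
// `backend` argument when fetching per-layer tensors.
static void report_offload_target() {
    std::printf("offloaded layers will be tagged %s\n", OFFLOAD_TARGET);
}
```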
@@ -1046,14 +1053,23 @@ static void llama_model_load_internal(
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            // TODO: Normalize this after OpenCL supports mat mul with repeat
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            } else {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
             layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
             layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
             layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            } else {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
@@ -1064,6 +1080,12 @@ static void llama_model_load_internal(
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            } else if (backend == GGML_BACKEND_CL) {
+                // TODO: Until OpenCL supports mat mul with repeat
+                vram_total +=
+                    ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
     }
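The two hunks above encode the same workaround twice: when the offload backend is OpenCL, the small 1-D normalization weights (`attention_norm`, `ffn_norm`) are kept on the CPU because, per the TODO, the OpenCL path does not yet support mat mul with repeat, and the VRAM estimate is reduced to match. A hedged sketch of how that placement rule could be factored into helpers; the enum and function names here are hypothetical and not part of the patch:

```cpp
// Hypothetical refactor of the repeated if/else above; offload_backend
// stands in for ggml's backend enum.
enum offload_backend { OFFLOAD_CPU, OFFLOAD_CUDA, OFFLOAD_CL };

// Where should a 1-D normalization weight (attention_norm / ffn_norm) live?
// Under OpenCL it stays on the CPU until mat mul with repeat is supported.
static offload_backend norm_weight_backend(offload_backend layer_backend) {
    return layer_backend == OFFLOAD_CUDA ? OFFLOAD_CUDA : OFFLOAD_CPU;
}

// Matching rule for the VRAM estimate: only count the norm weights when
// they are actually resident on the device (the CUDA path).
static bool count_norm_weights_in_vram(offload_backend layer_backend) {
    return layer_backend == OFFLOAD_CUDA;
}
```

In the patch itself the check is written inline, which keeps the diff local but repeats the CUDA-only condition in three places (both norm tensors and the VRAM sum).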
@@ -1089,14 +1111,13 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-    fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
-    fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
 #elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
@@ -1109,34 +1130,6 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CLBLAST
-    {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
-            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-        }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
-    }
-#endif
-
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }