
Commit 34b283d

Normalize OpenCL loading code as CUDA
1 parent 3be3ba3 commit 34b283d

File tree: 3 files changed (+35, -39 lines)


ggml-opencl.cpp

Lines changed: 5 additions & 2 deletions
@@ -7,6 +7,7 @@
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
 
+#include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -1006,7 +1007,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
     return 0;
 }
 
-void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+void ggml_cl_transform_tensor(const void * data, ggml_tensor * tensor) {
     const int64_t ne0 = tensor->ne[0];
     const int64_t ne1 = tensor->ne[1];
     const int64_t ne2 = tensor->ne[2];
@@ -1019,6 +1020,8 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
     *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
 
+    tensor->data = (void*)data;
+
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
@@ -1030,5 +1033,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
     tensor->data = dst;
-    tensor->backend = GGML_BACKEND_CL;
+    assert(tensor->backend == GGML_BACKEND_CL);
 }
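
Note: with the new signature, `ggml_cl_transform_tensor` mirrors `ggml_cuda_load_data`: the caller supplies the host-side weight data and a tensor whose `backend` field has already been set by the loader. A minimal sketch of the intended call pattern follows; the `offload_to_opencl` wrapper is illustrative only and not part of this commit.

```cpp
#include "ggml.h"
#include "ggml-opencl.h"

// Illustrative wrapper: shows the contract of the changed API.
// The caller decides placement and sets the backend first; the function now
// asserts GGML_BACKEND_CL instead of assigning it, uploads `host_data` to a
// device buffer, and repoints tensor->data at the resulting cl_mem handle.
static void offload_to_opencl(const void * host_data, struct ggml_tensor * t) {
    t->backend = GGML_BACKEND_CL;            // placement chosen by the loader
    ggml_cl_transform_tensor(host_data, t);  // copy to VRAM; t->data -> cl_mem*
}
```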

ggml-opencl.h

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void ggml_cl_host_free(void * ptr);
 
-void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cl_transform_tensor(const void * data, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }

llama.cpp

Lines changed: 29 additions & 36 deletions
@@ -721,6 +721,11 @@ struct llama_model_loader {
             case GGML_BACKEND_CUDA:
                 ggml_cuda_load_data(lt.data, lt.ggml_tensor);
                 break;
+#endif
+#ifdef GGML_USE_CLBLAST
+            case GGML_BACKEND_CL:
+                ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                break;
 #endif
             default:
                 continue;
@@ -1006,8 +1011,10 @@ static void llama_model_load_internal(
         }
     }
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS)
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#elif defined(GGML_USE_CLBLAST)
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
 #endif
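
Note: `LLAMA_BACKEND_OFFLOAD` now resolves at compile time to whichever GPU backend is enabled, falling back to the CPU backend when neither is. For context, this is roughly how the loader picks a per-layer backend in the surrounding code; `i_gpu_start` (the first layer index to offload) is taken from that context and is not part of this hunk.

```cpp
// Layers below i_gpu_start stay on the CPU; the rest use the
// compile-time-selected offload backend (CUDA, OpenCL, or CPU).
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU
                                                  : LLAMA_BACKEND_OFFLOAD;
```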
@@ -1046,14 +1053,23 @@ static void llama_model_load_internal(
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            // TODO: Normalize this after OpenCL supports mat mul with repeat
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            } else {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
             layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
             layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
             layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            } else {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
@@ -1064,6 +1080,12 @@ static void llama_model_load_internal(
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            } else if (backend == GGML_BACKEND_CL) {
+                // TODO: Until OpenCL supports mat mul with repeat
+                vram_total +=
+                    ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
     }
@@ -1089,14 +1111,13 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-    fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
-    fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
 #elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
@@ -1109,34 +1130,6 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CLBLAST
-    {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
-            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-        }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
-    }
-#endif
-
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
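
Note on the TODO in the hunks above: the attention-norm and ffn-norm weights stay on the CPU when offloading via OpenCL because the OpenCL backend cannot yet run on-device the broadcasted multiplication (a `ggml_mul` over a `ggml_repeat`) that consumes them at evaluation time. The sketch below is a paraphrase of how the graph applies the norm weight in llama.cpp of this era, shown only for context and not part of this diff.

```cpp
// RMS-normalize the activations, then scale by the per-channel norm weight.
// ggml_repeat broadcasts the small {n_embd} weight across the activations; the
// following ggml_mul is the operation the OpenCL backend cannot yet offload,
// so the norm tensors are kept CPU-resident for GGML_BACKEND_CL.
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].attention_norm, cur), cur);
```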
