@@ -721,6 +721,11 @@ struct llama_model_loader {
             case GGML_BACKEND_CUDA:
                 ggml_cuda_load_data(lt.data, lt.ggml_tensor);
                 break;
+#endif
+#ifdef GGML_USE_CLBLAST
+            case GGML_BACKEND_CL:
+                ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                break;
 #endif
             default:
                 continue;
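Aside for readers skimming the patch without the surrounding file: the new `GGML_BACKEND_CL` case mirrors the existing CUDA path, so tensors tagged for the OpenCL backend are uploaded while the model file is being read, rather than in a separate pass after loading (the block removed at the end of this diff). A minimal standalone sketch of that dispatch shape, using hypothetical stand-in names (`loader_backend`, `loader_tensor`, `upload_tensor`) rather than the real loader structures in llama.cpp:

```cpp
#include <cstdio>

// Hypothetical stand-ins for ggml's backend enum and loader tensor type.
enum loader_backend { BACKEND_CPU, BACKEND_CUDA, BACKEND_CL };

struct loader_tensor {
    loader_backend backend;
    const char *   name;
};

// Route a freshly read tensor to the device that will own it; backends that
// are not compiled in simply fall through to the default branch.
static void upload_tensor(const loader_tensor & t) {
    switch (t.backend) {
#ifdef GGML_USE_CUBLAS
        case BACKEND_CUDA:
            std::printf("%s -> CUDA (ggml_cuda_load_data in the real code)\n", t.name);
            break;
#endif
#ifdef GGML_USE_CLBLAST
        case BACKEND_CL:
            std::printf("%s -> OpenCL (ggml_cl_transform_tensor in the real code)\n", t.name);
            break;
#endif
        default:
            // CPU tensors need no device upload.
            break;
    }
}
```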
@@ -1006,8 +1011,10 @@ static void llama_model_load_internal(
         }
     }
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS)
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#elif defined(GGML_USE_CLBLAST)
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
 #endif
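The preprocessor chain above decides at build time which backend constant offloaded layers are tagged with: CUDA when built with cuBLAS, OpenCL when built with CLBlast, and plain CPU (effectively no offload) otherwise. A small sketch of the same selection pattern, assuming the usual `GGML_USE_CUBLAS` / `GGML_USE_CLBLAST` build defines; `report_offload_target()` is a hypothetical helper used only for illustration:

```cpp
#include <cstdio>

// Exactly one branch survives preprocessing, depending on the build flags.
#if defined(GGML_USE_CUBLAS)
#define OFFLOAD_TARGET "GGML_BACKEND_CUDA"
#elif defined(GGML_USE_CLBLAST)
#define OFFLOAD_TARGET "GGML_BACKEND_CL"
#else
#define OFFLOAD_TARGET "GGML_BACKEND_CPU"
#endif

// Hypothetical helper: in the real code the chosen constant is passed as the
// `backend` argument when fetching per-layer tensors.
static void report_offload_target() {
    std::printf("offloaded layers will be tagged %s\n", OFFLOAD_TARGET);
}
```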
@@ -1046,14 +1053,23 @@ static void llama_model_load_internal(
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            // TODO: Normalize this after OpenCL supports mat mul with repeat
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            } else {
+                layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
             layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
             layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
             layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            } else {
+                layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
@@ -1064,6 +1080,12 @@ static void llama_model_load_internal(
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            } else if (backend == GGML_BACKEND_CL) {
+                // TODO: Until OpenCL supports mat mul with repeat
+                vram_total +=
+                    ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
     }
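The two hunks above encode the same workaround twice: when the offload backend is OpenCL, the small 1-D normalization weights (`attention_norm`, `ffn_norm`) are kept on the CPU because, per the TODO, the OpenCL path does not yet support mat mul with repeat, and the VRAM estimate is reduced to match. A hedged sketch of how that placement rule could be factored into helpers; the enum and function names here are hypothetical and not part of the patch:

```cpp
// Hypothetical refactor of the repeated if/else above; offload_backend
// stands in for ggml's backend enum.
enum offload_backend { OFFLOAD_CPU, OFFLOAD_CUDA, OFFLOAD_CL };

// Where should a 1-D normalization weight (attention_norm / ffn_norm) live?
// Under OpenCL it stays on the CPU until mat mul with repeat is supported.
static offload_backend norm_weight_backend(offload_backend layer_backend) {
    return layer_backend == OFFLOAD_CUDA ? OFFLOAD_CUDA : OFFLOAD_CPU;
}

// Matching rule for the VRAM estimate: only count the norm weights when
// they are actually resident on the device (the CUDA path).
static bool count_norm_weights_in_vram(offload_backend layer_backend) {
    return layer_backend == OFFLOAD_CUDA;
}
```

In the patch itself the check is written inline, which keeps the diff local but repeats the CUDA-only condition in three places (both norm tensors and the VRAM sum).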
@@ -1089,14 +1111,13 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-    fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
-    fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
 #elif !defined(GGML_USE_CLBLAST)
     (void) n_gpu_layers;
 #endif
@@ -1109,34 +1130,6 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CLBLAST
-    {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
-            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-        }
-
-        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
-    }
-#endif
-
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }