Commit d5b111f

Clblast fixes + enhancements to save VRAM and offload more layers (#1675)
* Use events instead of clFinish, where possible
* OpenCL: don't load GPU layers into RAM, add mul_f32 kernel
* Reduce queueing overhead for contiguous tensors by using a single mul kernel call
* Adapt to #1612 cl_mem malloc changes
* Reduce code duplication between the CUDA and OpenCL branches
* Improve implementation
* Clblast fixes + enhancements to save VRAM:
  1. Change all Clblast buffers to CL_MEM_READ_WRITE, as the pool malloc currently doesn't properly handle other flags.
  2. When recycling buffers in pool malloc, always assign the SMALLEST available buffer that fits, instead of the FIRST available buffer.
  3. When failing to recycle a buffer in pool malloc (all too small), recycle the largest available free buffer by resizing it.
* Use <limits> for the maximum size_t value
* Remove the flags parameter from the CL pool malloc and apply code-tidying suggestions
1 parent 2d43387 commit d5b111f
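
The core of this change is the new buffer-recycling policy in ggml_cl_pool_malloc (points 2 and 3 of the commit message). Below is a toy, self-contained sketch of that policy; toy_buffer, pool_pick and the hard-coded pool sizes are illustrative only and not part of the commit, and the real code operates on cl_mem objects and releases the evicted buffer with clReleaseMemObject.

// Toy, self-contained sketch of the new pool policy (illustrative names only;
// the real ggml_cl_pool_malloc works on cl_mem buffers and CL error codes).
#include <cstdio>
#include <cstddef>
#include <limits>

struct toy_buffer { size_t size; }; // size == 0 means the slot is unused

static const int MAX_BUFFERS = 4;
static toy_buffer pool[MAX_BUFFERS] = { {0}, {256}, {1024}, {64} };

// Returns the index of the recycled slot, or -1 when the caller has to create
// a fresh buffer of exactly `size` bytes (possibly after the largest free
// buffer has been dropped to make room for it).
static int pool_pick(size_t size) {
    int best_i = -1;
    size_t best_size = std::numeric_limits<size_t>::max(); // smallest free buffer that fits
    int worst_i = -1;
    size_t worst_size = 0;                                  // largest free buffer seen so far
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        const toy_buffer & b = pool[i];
        if (b.size > 0 && b.size >= size && b.size < best_size) { best_i = i; best_size = b.size; }
        if (b.size > 0 && b.size > worst_size)                  { worst_i = i; worst_size = b.size; }
    }
    if (best_i != -1) {            // reuse the tightest fit instead of the first fit
        pool[best_i].size = 0;
        return best_i;
    }
    if (worst_i != -1) {           // nothing fits: drop the largest free buffer so it can be re-created
        pool[worst_i].size = 0;
    }
    return -1;
}

int main() {
    printf("request 200  -> slot %d\n", pool_pick(200));  // reuses the 256-byte slot, not the 1024-byte one
    printf("request 4096 -> slot %d\n", pool_pick(4096)); // no fit: evicts the 1024-byte slot, returns -1
}

Best-fit reuse keeps small requests from permanently occupying the largest buffers, while the eviction fallback caps the number of resident buffers instead of growing VRAM use without bound.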

File tree

1 file changed (+46, -21 lines)

ggml-opencl.cpp

Lines changed: 46 additions & 21 deletions
@@ -4,6 +4,7 @@
 #include <atomic>
 #include <sstream>
 #include <vector>
+#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -604,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
         }
     }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
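
The remaining hunks update every caller to the new two-argument signature. As a hedged sketch of the calling pattern, with placeholder names (`n`, `scratch`) and assuming the pre-existing ggml_cl_pool_free(cl_mem, size_t) helper in this file:

// Sketch only: `n` and `scratch` are placeholder names, not from the diff.
size_t scratch_size = 0;
cl_mem scratch = ggml_cl_pool_malloc(n * sizeof(float), &scratch_size); // no flags argument; pool buffers are CL_MEM_READ_WRITE
// ... enqueue kernels that read from and write to `scratch` ...
ggml_cl_pool_free(scratch, scratch_size); // hand the buffer back to the pool for reuse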
@@ -692,9 +716,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -792,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -868,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -970,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@@ -1143,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
