@@ -4,6 +4,7 @@
 #include <atomic>
 #include <sstream>
 #include <vector>
+#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -604,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); // smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; // largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        cl_buffer& b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
         }
     }
+    if (best_i != -1) // found the smallest unused buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if (worst_i != -1) // no unused buffer fits the request; free the largest one so the fresh allocation replaces it instead of growing the pool
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
@@ -692,9 +716,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -792,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -868,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -970,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@@ -1143,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
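
For context, here is a minimal host-side sketch of the best-fit / evict-largest strategy that the reworked `ggml_cl_pool_malloc` applies, with `std::malloc`/`std::free` standing in for `clCreateBuffer`/`clReleaseMemObject`. The names `pool_entry`, `g_pool`, `pool_malloc`, and `MAX_BUFFERS` are illustrative only and not part of ggml.

```cpp
#include <cstdlib>
#include <limits>

constexpr int MAX_BUFFERS = 16;   // illustrative pool size, not ggml's MAX_CL_BUFFERS

struct pool_entry {
    void * ptr  = nullptr;
    size_t size = 0;              // 0 means this slot holds no reusable block
};

static pool_entry g_pool[MAX_BUFFERS];

static void * pool_malloc(size_t size, size_t * actual_size) {
    int    best_i     = -1;
    size_t best_size  = std::numeric_limits<size_t>::max(); // smallest free block that fits
    int    worst_i    = -1;
    size_t worst_size = 0;                                   // largest free block seen
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        pool_entry & b = g_pool[i];
        if (b.size > 0 && b.size >= size && b.size < best_size) { best_i = i; best_size = b.size; }
        if (b.size > worst_size)                                { worst_i = i; worst_size = b.size; }
    }
    if (best_i != -1) {   // best fit: reuse the smallest block that is large enough
        pool_entry & b = g_pool[best_i];
        void * ptr   = b.ptr;
        *actual_size = b.size;
        b.ptr  = nullptr;
        b.size = 0;
        return ptr;
    }
    if (worst_i != -1) {  // nothing fits: free the largest block so the new allocation replaces it
        pool_entry & b = g_pool[worst_i];
        std::free(b.ptr);
        b.ptr  = nullptr;
        b.size = 0;
    }
    *actual_size = size;  // fall back to a fresh allocation of exactly the requested size
    return std::malloc(size);
}
```

The same two-pass scan drives the OpenCL version: reusing the tightest fit avoids handing a huge buffer to a small request, and releasing the largest idle buffer before allocating keeps the pool's total memory from growing when no existing buffer is big enough.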