Skip to content

Commit 2b0c6a5

Browse files
0cc4m and SlyEcho committed
Improve code quality
* Move internal stuff out of header
* Use internal enums instead of CLBlast enums
* Remove leftover C++ includes and defines
* Make event use easier to read

Co-authored-by: Henri Vasserman <[email protected]>
1 parent 1370710 commit 2b0c6a5

File tree

5 files changed

+58
-47
lines changed

5 files changed

+58
-47
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ if (LLAMA_CLBLAST)
174174
if (CLBlast_FOUND)
175175
message(STATUS "CLBlast found")
176176

177-
set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
177+
set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
178178

179179
add_compile_definitions(GGML_USE_CLBLAST)
180180

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ ifdef LLAMA_CLBLAST
117117
CFLAGS += -DGGML_USE_CLBLAST
118118
LDFLAGS += -lclblast -lOpenCL
119119
OBJS += ggml-opencl.o
120-
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
120+
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
121121
$(CXX) $(CXXFLAGS) -c $< -o $@
122122
endif
123123
ifdef LLAMA_GPROF

ggml-opencl.cpp renamed to ggml-opencl.c

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,24 @@
11
#include "ggml-opencl.h"
22

3-
#include <atomic>
4-
#include <cstdio>
5-
#include <cstring>
3+
#define CL_TARGET_OPENCL_VERSION 110
4+
#include <clblast_c.h>
5+
6+
#include <stdio.h>
7+
#include <string.h>
68

79
#include "ggml.h"
810

911
#include <ggml_clblast_dequant.cl>
1012

13+
#define CL_CHECK(err, name) \
14+
do { \
15+
cl_int err_ = (err); \
16+
if (err_ != CL_SUCCESS) { \
17+
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
18+
exit(1); \
19+
} \
20+
} while (0)
21+
1122
cl_platform_id platform;
1223
cl_device_id device;
1324
cl_context context;
@@ -74,7 +85,7 @@ void ggml_cl_init(void) {
7485
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
7586
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
7687
CL_CHECK(err, "clCreateContext");
77-
queue = clCreateCommandQueue(context, device, 0, &err);
88+
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
7889
CL_CHECK(err, "clCreateCommandQueue");
7990

8091
free(platforms);
@@ -93,7 +104,7 @@ void ggml_cl_init(void) {
93104
CL_CHECK(err, "clCreateKernel");
94105
}
95106

96-
void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
107+
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
97108
if (req_size <= *cur_size) {
98109
return;
99110
}
@@ -108,11 +119,14 @@ void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_me
108119
CL_CHECK(err, "clCreateBuffer");
109120
}
110121

111-
void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype) {
122+
void ggml_cl_sgemm_wrapper(
123+
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
124+
const int m, const int n, const int k,
125+
const float alpha, const void *host_a, const int lda,
126+
const float *host_b, const int ldb, const float beta,
127+
float *host_c, const int ldc, const int btype) {
112128
cl_int err = 0;
113129

114-
cl_event events[4] = { NULL };
115-
116130
cl_kernel kernel;
117131
size_t global = n * k, local, size_qb;
118132
bool dequant;
@@ -162,42 +176,46 @@ void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose tra
162176
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
163177
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
164178

179+
cl_event ev_a, ev_qb, ev_b;
180+
165181
if (dequant) {
166182
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
167183
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
168184
CL_CHECK(err, "clSetKernelArg");
169-
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, events + 1);
185+
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
170186
} else {
171-
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, events + 1);
187+
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
172188
}
173189

174-
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, events);
190+
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
175191
if (dequant) {
176-
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, events + 1, events + 3);
192+
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
177193
CL_CHECK(err, "clEnqueueNDRangeKernel");
178194
}
179-
clWaitForEvents(dequant ? 4 : 3, events);
180-
clReleaseEvent(events[0]);
181-
clReleaseEvent(events[1]);
182-
clReleaseEvent(events[2]);
195+
clWaitForEvents(1, &ev_a);
196+
clWaitForEvents(1, &ev_b);
197+
clReleaseEvent(ev_a);
198+
clReleaseEvent(ev_b);
183199
if (dequant) {
184-
clReleaseEvent(events[3]);
200+
clReleaseEvent(ev_qb);
185201
}
186202

187-
CLBlastSgemm(order,
188-
trans_a, trans_b,
203+
cl_event ev_sgemm;
204+
CLBlastSgemm((CLBlastLayout)order,
205+
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
189206
m, n, k,
190207
alpha,
191208
cl_buffer_a, 0, lda,
192209
cl_buffer_b, 0, ldb,
193210
beta,
194211
cl_buffer_c, 0, ldc,
195-
&queue, events);
212+
&queue, &ev_sgemm);
196213

197-
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, events, events + 1);
214+
cl_event ev_c;
215+
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
198216

199217
// Wait for completion
200-
clWaitForEvents(2, events);
201-
clReleaseEvent(events[0]);
202-
clReleaseEvent(events[1]);
218+
clWaitForEvents(1, &ev_c);
219+
clReleaseEvent(ev_sgemm);
220+
clReleaseEvent(ev_c);
203221
}

ggml-opencl.h

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,23 @@
11
#pragma once
22

3-
#define CL_TARGET_OPENCL_VERSION 110
4-
#include <clblast_c.h>
5-
#define MAX_CL_BUFFERS 16
6-
73
#ifdef __cplusplus
84
extern "C" {
95
#endif
106

11-
// Buffer reuse code adapted from cuda implementation by slaren
12-
#define CL_CHECK(err, name) \
13-
do { \
14-
cl_int err_ = (err); \
15-
if (err_ != CL_SUCCESS) { \
16-
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
17-
exit(1); \
18-
} \
19-
} while (0)
7+
void ggml_cl_init(void);
208

21-
cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size);
22-
void ggml_cl_pool_free(cl_mem mem, size_t size);
9+
enum ggml_blas_order {
10+
GGML_BLAS_ORDER_ROW_MAJOR = 101,
11+
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
12+
};
2313

24-
cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer);
25-
void ggml_cl_init(void);
14+
enum ggml_blas_op {
15+
GGML_BLAS_OP_N = 111,
16+
GGML_BLAS_OP_T = 112,
17+
GGML_BLAS_OP_C = 113,
18+
};
2619

27-
void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
20+
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
2821

2922
#ifdef __cplusplus
3023
}

ggml.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7575,7 +7575,7 @@ static void ggml_compute_forward_mul_mat_f32(
75757575
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
75767576
#elif defined(GGML_USE_CLBLAST)
75777577
// zT = y * xT
7578-
ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes,
7578+
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
75797579
ne11, ne01, ne10,
75807580
1.0f, y, ne10,
75817581
x, ne10,
@@ -7809,7 +7809,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
78097809
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
78107810

78117811
// zT = y * xT
7812-
ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes,
7812+
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
78137813
ne11, ne01, ne10,
78147814
1.0f, y, ne10,
78157815
x, ne10,
@@ -8080,7 +8080,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
80808080
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
80818081
#elif defined(GGML_USE_CLBLAST)
80828082
// zT = y * xT
8083-
ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes,
8083+
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
80848084
ne11, ne01, ne10,
80858085
1.0f, y, ne10,
80868086
x, ne10,

0 commit comments

Comments
 (0)