Skip to content

Commit c1161ec

Browse files
lhezquic-sszotshawngu-quicquic-aanguswanghqc
committed
[cl][adreno] Add Adreno GPU support
Add new OpenCL backend to support Adreno GPUs --------- Co-authored-by: Skyler Szot <[email protected]> Co-authored-by: Shangqing Gu <[email protected]> Co-authored-by: Alexander Angus <[email protected]> Co-authored-by: Hongqiang Wang <[email protected]> Co-authored-by: Max Krasnyanskiy <[email protected]>
1 parent dd3a6ce commit c1161ec

21 files changed

+9330
-1
lines changed

ggml/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,12 @@ option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl ca
170170
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
171171
"ggml: sycl target device")
172172

173+
option(GGML_OPENCL "ggml: use OpenCL" OFF)
174+
option(GGML_OPENCL_SMALL_ALLOC "ggml: use small allocation for tensors" ON)
175+
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
176+
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
177+
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
178+
173179
# extra artifacts
174180
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
175181
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

ggml/include/ggml-alloc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
6969
// Utils
7070
// Create a buffer and allocate all the tensors in a ggml_context
7171
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
72+
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
7273
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
7374

7475
#ifdef __cplusplus

ggml/include/ggml-opencl2.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT

#ifndef GGML_OPENCL2_H
#define GGML_OPENCL2_H

#include "ggml.h"
#include "ggml-backend.h"

// CL_CHECK expands to fprintf(stderr, ...); include it here so that every
// translation unit using the macro compiles without relying on transitive
// includes.
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

// Evaluate an OpenCL call/expression and abort (via GGML_ASSERT) on failure,
// printing the failing expression, error code, and source location.
// NOTE(review): the expansion declares a `cl_int`, so the including file must
// have the OpenCL headers (CL/cl.h) in scope before using CL_CHECK — confirm
// whether this header should include them itself.
#define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
        if (err_ != CL_SUCCESS) {                                   \
            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
                #err, err_, __FILE__, __LINE__);                    \
            GGML_ASSERT(0);                                         \
        }                                                           \
    } while (0)

//
// backend API
//

// Create and initialize the OpenCL2 backend.
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl2_init(void);
// Returns true if `backend` is an OpenCL2 backend instance.
GGML_BACKEND_API bool ggml_backend_is_opencl2(ggml_backend_t backend);

// Device and host (pinned) buffer types for this backend.
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_host_buffer_type(void);

// Registry-style init entry point (params/user_data currently backend-defined).
GGML_BACKEND_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data);

// Backend registration object used by ggml-backend-reg.cpp.
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl2_reg(void);

#ifdef __cplusplus
}
#endif

#endif // GGML_OPENCL2_H

ggml/src/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,19 @@ function(ggml_add_backend backend)
246246
endif()
247247
endfunction()
248248

249+
# TODO: This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
250+
# migrate to the non SMALL_ALLOC path. Also need to converge on the backend name
251+
# so we don't need this name conversion.
252+
if (GGML_OPENCL)
253+
set(GGML_OPENCL2 ON)
254+
add_compile_definitions(GGML_USE_OPENCL)
255+
if (GGML_OPENCL_SMALL_ALLOC)
256+
add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
257+
endif ()
258+
else ()
259+
set(GGML_OPENCL2 OFF)
260+
endif ()
261+
249262
ggml_add_backend(CPU)
250263
ggml_add_backend(AMX)
251264
ggml_add_backend(BLAS)
@@ -257,6 +270,7 @@ ggml_add_backend(METAL)
257270
ggml_add_backend(RPC)
258271
ggml_add_backend(SYCL)
259272
ggml_add_backend(Vulkan)
273+
ggml_add_backend(OpenCL2)
260274
ggml_add_backend(MUSA)
261275

262276
foreach (target ggml-base ggml)

ggml/src/ggml-alloc.c

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,92 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
10351035
return buffer;
10361036
}
10371037

1038+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
#ifndef GGML_OPENCL_SMALL_ALLOC
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
#else
    // Small allocation allocates a separate buffer for each tensor. Instead of
    // collecting multiple tensors to allocate a large buffer, each tensor is
    // allocated a buffer immediately. This is only supposed to be used for
    // weights tensors (note that weights can be f32).
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    // The allocation logic here has gone beyond intention in order to make
    // `test-backend-ops` work. The very initial intention was to allocate
    // memory for weights - each weight tensor gets its own buffer object.
    // The original function should be used to allocate for intermediate tensors.
    // There are usually no view tensors for weights; this is not true for
    // intermediate tensors. However, in `test-backend-ops` there is no
    // differentiation between weight tensors and intermediate tensors.
    // This function is used for general allocation when small allocation is
    // enabled in the test. This requires the function to also handle view
    // tensors, which do not require actual allocation. In the original
    // function, view tensors are allocated with other non-view tensors since
    // view tensor sizes are 0.
    // Here, we try to identify view tensors and allocate them with the next
    // non-view tensor. View tensors cannot be allocated (alone) but must be
    // initialized (together with non-view tensors).

    // First tensor of the current run of consecutive zero-size (view or
    // already-allocated) tensors; NULL when the previous tensor was non-view.
    struct ggml_tensor * first_view = NULL;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            // Tensor size must be properly padded.
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (this_size == 0) {
            // This is a view tensor if its size is 0. Record its location if
            // it is the first one after a non-view tensor; for subsequent
            // consecutive view tensors just keep going. All consecutive view
            // tensors are allocated together with the next non-view tensor.
            // (Previously only the FIRST view of a run was skipped; a second
            // consecutive view fell through and issued a zero-size
            // alloc_tensor_range, contrary to the intent described above.)
            if (first_view == NULL) {
                first_view = t;
            }
            continue;
        }

        if (first_view) {
            // This is a non-view tensor. If there are any view tensors before
            // this non-view tensor, allocate these view tensors and this
            // non-view tensor together, starting at the first view tensor.
            first = first_view;
        } else {
            // Otherwise, allocate this non-view tensor immediately.
            first = t;
        }

        if (!alloc_tensor_range(ctx, first, ggml_get_next_tensor(ctx, t), buft, this_size, &buffers, &n_buffers)) {
            return NULL;
        }

        // Always reset first_view after a non-view tensor.
        first_view = NULL;
    }
    // NOTE(review): view tensors trailing the last non-view tensor are never
    // passed to alloc_tensor_range (same as the original code) — presumably
    // they need initialization only; confirm against alloc_tensor_range.

    if (n_buffers == 0) {
#ifndef NDEBUG
        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    // Collapse to a single buffer when possible, otherwise wrap all buffers
    // in a multi-buffer; the temporary array is freed either way.
    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
#endif
}
1123+
10381124
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
10391125
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
10401126
}

ggml/src/ggml-backend-reg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
#include "ggml-vulkan.h"
2424
#endif
2525

26+
#ifdef GGML_USE_OPENCL2
27+
#include "ggml-opencl2.h"
28+
#endif
29+
2630
#ifdef GGML_USE_BLAS
2731
#include "ggml-blas.h"
2832
#endif
@@ -60,6 +64,9 @@ struct ggml_backend_registry {
6064
#ifdef GGML_USE_VULKAN
6165
register_backend(ggml_backend_vk_reg());
6266
#endif
67+
#ifdef GGML_USE_OPENCL2
68+
register_backend(ggml_backend_opencl2_reg());
69+
#endif
6370
#ifdef GGML_USE_CANN
6471
register_backend(ggml_backend_cann_reg());
6572
#endif

ggml/src/ggml-opencl2/CMakeLists.txt

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
2+
find_package(OpenCL)

if (OpenCL_FOUND)
    # The Python interpreter is needed at build time to embed the .cl kernels.
    find_package(Python3 REQUIRED COMPONENTS Interpreter)

    set(TARGET_NAME ggml-opencl2)

    add_library(${TARGET_NAME}
                ggml-opencl2.cpp
                ../../include/ggml-opencl2.h)
    target_link_libraries(${TARGET_NAME} PRIVATE ggml-base ${OpenCL_LIBRARIES})
    target_include_directories(${TARGET_NAME} PRIVATE . .. ${OpenCL_INCLUDE_DIRS})

    # TODO - this is kind of strange. We have been calling this backend OpenCL2,
    # so everything (function names, folder name, etc) except macro switches
    # has been OpenCL2. Now, the backend framework enforces the use of the folder
    # name as the backend name and switch. So, GGML_USE_OPENCL2 is used in
    # ggml-backend-reg.cpp, but the rest still uses GGML_USE_OPENCL.
    target_compile_definitions(${TARGET_NAME} PRIVATE GGML_USE_OPENCL)

    if (GGML_OPENCL_PROFILING)
        message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_PROFILING)
    endif ()

    target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_SOA_Q)

    if (GGML_OPENCL_SMALL_ALLOC)
        message(STATUS "OpenCL will allocate a separate buffer for each tensor. "
                       "The default behavior allocates a large buffer to hold multiple tensors.")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_SMALL_ALLOC)
    endif ()

    if (GGML_OPENCL_USE_ADRENO_KERNELS)
        message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_USE_ADRENO_KERNELS)
    endif ()

    if (GGML_OPENCL_EMBED_KERNELS)
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_EMBED_KERNELS)

        set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
        target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_BINARY_DIR}/autogenerated")

        # Registers a build rule that embeds kernels/<name>.cl into the
        # auto-generated header <name>.cl.h and adds it to the target sources.
        function(ggml_opencl2_embed_kernel name)
            set(src "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${name}.cl")
            set(dst "${CMAKE_BINARY_DIR}/autogenerated/${name}.cl.h")
            add_custom_command(
                OUTPUT ${dst}
                COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${src} ${dst}
                DEPENDS ${src} ${EMBED_KERNEL_SCRIPT}
                COMMENT "Generate ${name}.cl.h"
                VERBATIM
            )
            target_sources(${TARGET_NAME} PRIVATE ${dst})
        endfunction()

        foreach (kernel
                 ggml-opencl2
                 ggml-opencl2_mm
                 ggml-opencl2_cvt
                 ggml-opencl2_gemv_noshuffle
                 ggml-opencl2_gemv_noshuffle_general
                 ggml-opencl2_mul_mat_Ab_Bi_8x4
                 ggml-opencl2_transpose_16
                 ggml-opencl2_transpose_32
                 ggml-opencl2_transpose_32_16)
            ggml_opencl2_embed_kernel(${kernel})
        endforeach ()
    else ()
        # Kernels are loaded from files at runtime: copy them to the bin directory.
        # NOTE(review): only three of the nine kernel files are copied here —
        # confirm whether the non-embedded path supports the Adreno kernels.
        configure_file(kernels/ggml-opencl2.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2.cl COPYONLY)
        configure_file(kernels/ggml-opencl2_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_mm.cl COPYONLY)
        configure_file(kernels/ggml-opencl2_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_cvt.cl COPYONLY)
    endif ()
else ()
    message(WARNING "OpenCL not found")
endif ()

0 commit comments

Comments
 (0)