Skip to content

Commit c1161ec

Browse files
lhezquic-sszotshawngu-quicquic-aanguswanghqc
committed
[cl][adreno] Add Adreno GPU support
Add new OpenCL backend to support Adreno GPUs --------- Co-authored-by: Skyler Szot <[email protected]> Co-authored-by: Shangqing Gu <[email protected]> Co-authored-by: Alexander Angus <[email protected]> Co-authored-by: Hongqiang Wang <[email protected]> Co-authored-by: Max Krasnyanskiy <[email protected]>
1 parent dd3a6ce commit c1161ec

21 files changed

+9330
-1
lines changed

ggml/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,12 @@ option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl ca
170170
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
171171
"ggml: sycl target device")
172172

173+
option(GGML_OPENCL "ggml: use OpenCL" OFF)
174+
option(GGML_OPENCL_SMALL_ALLOC "ggml: use small allocation for tensors" ON)
175+
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
176+
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
177+
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
178+
173179
# extra artifacts
174180
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
175181
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

ggml/include/ggml-alloc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
6969
// Utils
7070
// Create a buffer and allocate all the tensors in a ggml_context
7171
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
72+
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
7273
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
7374

7475
#ifdef __cplusplus

ggml/include/ggml-opencl2.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT

#ifndef GGML_OPENCL2_H
#define GGML_OPENCL2_H

#include "ggml.h"
#include "ggml-backend.h"

// CL_CHECK expands to fprintf(stderr, ...); include it here so that every
// translation unit using the macro compiles without relying on transitive
// includes.
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

// Evaluate an OpenCL call/expression and abort (via GGML_ASSERT) on failure,
// printing the failing expression, error code, and source location.
// NOTE(review): the expansion declares a `cl_int`, so the including file must
// have the OpenCL headers (CL/cl.h) in scope before using CL_CHECK — confirm
// whether this header should include them itself.
#define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
        if (err_ != CL_SUCCESS) {                                   \
            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
                #err, err_, __FILE__, __LINE__);                    \
            GGML_ASSERT(0);                                         \
        }                                                           \
    } while (0)

//
// backend API
//

// Create and initialize the OpenCL2 backend.
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl2_init(void);
// Returns true if `backend` is an OpenCL2 backend instance.
GGML_BACKEND_API bool ggml_backend_is_opencl2(ggml_backend_t backend);

// Device and host (pinned) buffer types for this backend.
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_host_buffer_type(void);

// Registry-style init entry point (params/user_data currently backend-defined).
GGML_BACKEND_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data);

// Backend registration object used by ggml-backend-reg.cpp.
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl2_reg(void);

#ifdef __cplusplus
}
#endif

#endif // GGML_OPENCL2_H

ggml/src/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,19 @@ function(ggml_add_backend backend)
246246
endif()
247247
endfunction()
248248

249+
# TODO: This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
250+
# migrate to the non SMALL_ALLOC path. Also need to converge on the backend name
251+
# so we don't need this name conversion.
252+
if (GGML_OPENCL)
253+
set(GGML_OPENCL2 ON)
254+
add_compile_definitions(GGML_USE_OPENCL)
255+
if (GGML_OPENCL_SMALL_ALLOC)
256+
add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
257+
endif ()
258+
else ()
259+
set(GGML_OPENCL2 OFF)
260+
endif ()
261+
249262
ggml_add_backend(CPU)
250263
ggml_add_backend(AMX)
251264
ggml_add_backend(BLAS)
@@ -257,6 +270,7 @@ ggml_add_backend(METAL)
257270
ggml_add_backend(RPC)
258271
ggml_add_backend(SYCL)
259272
ggml_add_backend(Vulkan)
273+
ggml_add_backend(OpenCL2)
260274
ggml_add_backend(MUSA)
261275

262276
foreach (target ggml-base ggml)

ggml/src/ggml-alloc.c

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,92 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
10351035
return buffer;
10361036
}
10371037

1038+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
#ifndef GGML_OPENCL_SMALL_ALLOC
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
#else
    // Small allocation allocates a separate buffer for each tensor. Instead of
    // collecting multiple tensors to allocate a large buffer, each tensor is
    // allocated a buffer immediately. This is only supposed to be used for
    // weights tensors (note that weights can be f32).
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    // The allocation logic here has gone beyond intention in order to make
    // `test-backend-ops` work. The very initial intention was to allocate
    // memory for weights - each weight tensor gets its own buffer object.
    // The original function should be used to allocate for intermediate tensors.
    // There are usually no view tensors for weights; this is not true for
    // intermediate tensors. However, in `test-backend-ops` there is no
    // differentiation between weight tensors and intermediate tensors.
    // This function is used for general allocation when small allocation is
    // enabled in the test. This requires the function to also handle view
    // tensors, which do not require actual allocation. In the original
    // function, view tensors are allocated with other non-view tensors since
    // view tensor sizes are 0.
    // Here, we try to identify view tensors and allocate them with the next
    // non-view tensor. View tensors cannot be allocated (alone) but must be
    // initialized (together with non-view tensors).

    // First tensor of the current run of consecutive zero-size (view or
    // already-allocated) tensors; NULL when the previous tensor was non-view.
    struct ggml_tensor * first_view = NULL;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            // Tensor size must be properly padded.
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (this_size == 0) {
            // This is a view tensor if its size is 0. Record its location if
            // it is the first one after a non-view tensor; for subsequent
            // consecutive view tensors just keep going. All consecutive view
            // tensors are allocated together with the next non-view tensor.
            // (Previously only the FIRST view of a run was skipped; a second
            // consecutive view fell through and issued a zero-size
            // alloc_tensor_range, contrary to the intent described above.)
            if (first_view == NULL) {
                first_view = t;
            }
            continue;
        }

        if (first_view) {
            // This is a non-view tensor. If there are any view tensors before
            // this non-view tensor, allocate these view tensors and this
            // non-view tensor together, starting at the first view tensor.
            first = first_view;
        } else {
            // Otherwise, allocate this non-view tensor immediately.
            first = t;
        }

        if (!alloc_tensor_range(ctx, first, ggml_get_next_tensor(ctx, t), buft, this_size, &buffers, &n_buffers)) {
            return NULL;
        }

        // Always reset first_view after a non-view tensor.
        first_view = NULL;
    }
    // NOTE(review): view tensors trailing the last non-view tensor are never
    // passed to alloc_tensor_range (same as the original code) — presumably
    // they need initialization only; confirm against alloc_tensor_range.

    if (n_buffers == 0) {
#ifndef NDEBUG
        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    // Collapse to a single buffer when possible, otherwise wrap all buffers
    // in a multi-buffer; the temporary array is freed either way.
    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
#endif
}
1123+
10381124
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
10391125
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
10401126
}

ggml/src/ggml-backend-reg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
#include "ggml-vulkan.h"
2424
#endif
2525

26+
#ifdef GGML_USE_OPENCL2
27+
#include "ggml-opencl2.h"
28+
#endif
29+
2630
#ifdef GGML_USE_BLAS
2731
#include "ggml-blas.h"
2832
#endif
@@ -60,6 +64,9 @@ struct ggml_backend_registry {
6064
#ifdef GGML_USE_VULKAN
6165
register_backend(ggml_backend_vk_reg());
6266
#endif
67+
#ifdef GGML_USE_OPENCL2
68+
register_backend(ggml_backend_opencl2_reg());
69+
#endif
6370
#ifdef GGML_USE_CANN
6471
register_backend(ggml_backend_cann_reg());
6572
#endif

ggml/src/ggml-opencl2/CMakeLists.txt

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
2+
find_package(OpenCL)

if (OpenCL_FOUND)
    # The Python interpreter is needed at build time to embed the .cl kernels.
    find_package(Python3 REQUIRED COMPONENTS Interpreter)

    set(TARGET_NAME ggml-opencl2)

    add_library(${TARGET_NAME}
                ggml-opencl2.cpp
                ../../include/ggml-opencl2.h)
    target_link_libraries(${TARGET_NAME} PRIVATE ggml-base ${OpenCL_LIBRARIES})
    target_include_directories(${TARGET_NAME} PRIVATE . .. ${OpenCL_INCLUDE_DIRS})

    # TODO - this is kind of strange. We have been calling this backend OpenCL2,
    # so everything (function names, folder name, etc) except macro switches
    # has been OpenCL2. Now, the backend framework enforces the use of the folder
    # name as the backend name and switch. So, GGML_USE_OPENCL2 is used in
    # ggml-backend-reg.cpp, but the rest still uses GGML_USE_OPENCL.
    target_compile_definitions(${TARGET_NAME} PRIVATE GGML_USE_OPENCL)

    if (GGML_OPENCL_PROFILING)
        message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_PROFILING)
    endif ()

    target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_SOA_Q)

    if (GGML_OPENCL_SMALL_ALLOC)
        message(STATUS "OpenCL will allocate a separate buffer for each tensor. "
                       "The default behavior allocates a large buffer to hold multiple tensors.")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_SMALL_ALLOC)
    endif ()

    if (GGML_OPENCL_USE_ADRENO_KERNELS)
        message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_USE_ADRENO_KERNELS)
    endif ()

    if (GGML_OPENCL_EMBED_KERNELS)
        target_compile_definitions(${TARGET_NAME} PRIVATE GGML_OPENCL_EMBED_KERNELS)

        set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
        target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_BINARY_DIR}/autogenerated")

        # Registers a build rule that embeds kernels/<name>.cl into the
        # auto-generated header <name>.cl.h and adds it to the target sources.
        function(ggml_opencl2_embed_kernel name)
            set(src "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${name}.cl")
            set(dst "${CMAKE_BINARY_DIR}/autogenerated/${name}.cl.h")
            add_custom_command(
                OUTPUT ${dst}
                COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${src} ${dst}
                DEPENDS ${src} ${EMBED_KERNEL_SCRIPT}
                COMMENT "Generate ${name}.cl.h"
                VERBATIM
            )
            target_sources(${TARGET_NAME} PRIVATE ${dst})
        endfunction()

        foreach (kernel
                 ggml-opencl2
                 ggml-opencl2_mm
                 ggml-opencl2_cvt
                 ggml-opencl2_gemv_noshuffle
                 ggml-opencl2_gemv_noshuffle_general
                 ggml-opencl2_mul_mat_Ab_Bi_8x4
                 ggml-opencl2_transpose_16
                 ggml-opencl2_transpose_32
                 ggml-opencl2_transpose_32_16)
            ggml_opencl2_embed_kernel(${kernel})
        endforeach ()
    else ()
        # Kernels are loaded from files at runtime: copy them to the bin directory.
        # NOTE(review): only three of the nine kernel files are copied here —
        # confirm whether the non-embedded path supports the Adreno kernels.
        configure_file(kernels/ggml-opencl2.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2.cl COPYONLY)
        configure_file(kernels/ggml-opencl2_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_mm.cl COPYONLY)
        configure_file(kernels/ggml-opencl2_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_cvt.cl COPYONLY)
    endif ()
else ()
    message(WARNING "OpenCL not found")
endif ()

0 commit comments

Comments
 (0)