ggml-org · qnixsynapse · Jan 30, 2025 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
@@ -9,9 +9,6 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
-#define GGML_SYCL_NAME "SYCL"
-#define GGML_SYCL_MAX_DEVICES 48
-
 #ifdef  __cplusplus
 extern "C" {
 #endif

diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp
@@ -0,0 +1,75 @@
+#include "argmax.hpp"
+
+static void argmax_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, queue_ptr stream) {
+    // Writes to dst[row] the column index of the largest of the ncols values in row `row` of x, for each of nrows rows.
+    const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    // One work-group per row. NOTE: the halving reduction below assumes SYCL_ARGMAX_BLOCK_SIZE is a power of two.
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<float, 1> shared_data(sycl::range<1>(SYCL_ARGMAX_BLOCK_SIZE), cgh);
+        sycl::local_accessor<int, 1>   shared_indices(sycl::range<1>(SYCL_ARGMAX_BLOCK_SIZE), cgh);
+
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            const int tid = item_ct1.get_local_id(2);
+            const int row = item_ct1.get_global_id(1);
+            // Phase 1: every work-item scans columns tid, tid + block size, ... and keeps its own best candidate.
+            float max_val = -INFINITY;
+            int   max_idx = -1;
+
+            for (int col = tid; col < ncols; col += SYCL_ARGMAX_BLOCK_SIZE) {
+                float val = x[row * ncols + col];
+                if (val > max_val) {
+                    max_val = val;
+                    max_idx = col;
+                }
+            }
+            // Publish the per-work-item candidates so the whole work-group can see them.
+            shared_data[tid]    = max_val;
+            shared_indices[tid] = max_idx;
+            item_ct1.barrier(sycl::access::fence_space::local_space);
+            // Phase 2: pairwise reduction; on equal values the entry of the lower work-item is kept.
+            for (int stride = SYCL_ARGMAX_BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
+                if (tid < stride) {
+                    float val1 = shared_data[tid];
+                    float val2 = shared_data[tid + stride];
+                    if (val2 > val1) {
+                        shared_data[tid]    = val2;
+                        shared_indices[tid] = shared_indices[tid + stride];
+                    }
+                }
+                item_ct1.barrier(sycl::access::fence_space::local_space);
+            }
+            // Phase 3: slot 0 now holds the winner of the row; one work-item stores it.
+            if (tid == 0) {
+                dst[row] = shared_indices[0];
+            }
+        });
+    });
+}
+
+static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    // Checks operand layout and types, then submits the row-wise argmax kernel on the context's stream.
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    ggml_tensor * const input       = dst->src[0];
+    const int64_t       row_len     = input->ne[0];
+    const int64_t       row_count   = ggml_nrows(input);
+    const auto *        in_values   = static_cast<const float *>(input->data);
+    auto *              out_indices = static_cast<int32_t *>(dst->data);
+    dpct::queue_ptr     queue       = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    argmax_f32_i32_sycl(in_values, out_indices, row_len, row_count, queue);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {  // entry point declared in argmax.hpp
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));  // the source tensor must be contiguous
+    GGML_SYCL_DEBUG("call %s\n", __func__);        // debug trace on entry
+    ggml_sycl_op_argmax(ctx, dst);                 // checks the operand types and submits the kernel
+    GGML_SYCL_DEBUG("call %s done\n", __func__);   // debug trace after the op helper returns
+}
diff --git a/ggml/src/ggml-sycl/argmax.hpp b/ggml/src/ggml-sycl/argmax.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ARGMAX_HPP
+#define GGML_SYCL_ARGMAX_HPP
+
+#include "common.hpp"
+// Row-wise argmax: for every row of the contiguous F32 tensor dst->src[0], stores the index of its largest value in the I32 tensor dst.
+void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ARGMAX_HPP
diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp
@@ -0,0 +1,130 @@
+#include "argsort.hpp"
+
+// Returns the smallest power of two that is not less than x (1 when x <= 1).
+static int next_power_of_2(int x) {
+    int pow2 = 1;
+    for (; pow2 < x; pow2 *= 2) {
+    }
+    return pow2;
+}
+
+// Exchanges the values of a and b through a temporary copy.
+template <typename T> static inline void ggml_sycl_swap(T & a, T & b) {
+    const T previous_a = a;
+    a                  = b;
+    b                  = previous_a;
+}
+
+template <ggml_sort_order order>
+__dpct_inline__ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad,
+                                              const sycl::nd_item<3> & item_ct1, uint8_t * dpct_local) {
+    // bitonic sort of the column indices of one row; dpct_local is used as scratch for ncols_pad int indices
+    int col = item_ct1.get_local_id(2);  // slot owned by this work-item
+    int row = item_ct1.get_group(1);     // row owned by this work-group
+    // NOTE(review): a work-item returning here would skip the barriers below; the launcher in this file uses exactly ncols_pad work-items per group, so this is not expected to trigger.
+    if (col >= ncols_pad) {
+        return;
+    }
+    // x_row: start of this row's values in x; dst_row: the scratch buffer viewed as an array of column indices
+    const float * x_row   = x + row * ncols;
+    auto          dst_row = (int *) dpct_local;
+
+    // initialize indices (slots at or beyond ncols hold padding indices)
+    dst_row[col] = col;
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    // k: length of the runs being merged, j: distance between compared slots; padding indices (>= ncols) are never used to read x and end up after the real entries
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {  // only the lower slot of each pair performs the compare-and-swap
+                if ((col & k) == 0) {  // bit k of col selects the direction of this run
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols &&
+                         (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                                                         x_row[dst_row[col]] < x_row[dst_row[ixj]]))) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols &&
+                         (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                                                         x_row[dst_row[col]] > x_row[dst_row[ixj]]))) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:1: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            item_ct1.barrier(sycl::access::fence_space::local_space);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static void argsort_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order,
+                                 queue_ptr stream) {
+    // The bitonic network sorts a power-of-two number of slots, so every row is padded up to padded_cols.
+    const int    padded_cols = next_power_of_2(ncols);
+    const size_t local_bytes = padded_cols * sizeof(int);
+    // One work-group per row, one work-item per padded slot, and one int of local scratch per slot.
+    const sycl::range<3>    group_shape(1, 1, padded_cols);
+    const sycl::range<3>    group_grid(1, nrows, 1);
+    const sycl::nd_range<3> launch(group_grid * group_shape, group_shape);
+
+    switch (order) {
+        case GGML_SORT_ORDER_ASC:
+            stream->submit([&](sycl::handler & cgh) {
+                sycl::local_accessor<uint8_t, 1> scratch(sycl::range<1>(local_bytes), cgh);
+                cgh.parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, padded_cols, item_ct1,
+                        scratch.get_multi_ptr<sycl::access::decorated::no>().get());
+                });
+            });
+            break;
+        case GGML_SORT_ORDER_DESC:
+            stream->submit([&](sycl::handler & cgh) {
+                sycl::local_accessor<uint8_t, 1> scratch(sycl::range<1>(local_bytes), cgh);
+                cgh.parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, padded_cols, item_ct1,
+                        scratch.get_multi_ptr<sycl::access::decorated::no>().get());
+                });
+            });
+            break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    // File-local helper: sorts each row of the F32 tensor dst->src[0] and stores the column indices in the I32 tensor dst.
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+    // The sort direction is stored in the first op parameter of dst.
+    const ggml_sort_order order       = static_cast<ggml_sort_order>(dst->op_params[0]);
+    dpct::queue_ptr       main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float *         src0_dd     = static_cast<const float *>(dst->src[0]->data);
+    int32_t *             dst_dd      = static_cast<int32_t *>(dst->data);
+    // ncols and nrows are narrowed to int by the launcher's signature.
+    argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {  // entry point declared in argsort.hpp
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));  // the source tensor must be contiguous
+    GGML_SYCL_DEBUG("call %s\n", __func__);        // debug trace on entry
+    ggml_sycl_op_argsort(ctx, dst);                // checks the operand types and submits the kernel
+    GGML_SYCL_DEBUG("call %s done\n", __func__);   // debug trace after the op helper returns
+}
diff --git a/ggml/src/ggml-sycl/argsort.hpp b/ggml/src/ggml-sycl/argsort.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ARGSORT_HPP
+#define GGML_SYCL_ARGSORT_HPP
+
+#include "common.hpp"
+// Row-wise argsort: for every row of the contiguous F32 tensor dst->src[0], stores the sorted column indices in the I32 tensor dst; the order comes from dst->op_params[0].
+void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif  // GGML_SYCL_ARGSORT_HPP
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
@@ -29,6 +29,16 @@
 #include "wkv6.hpp"
 #include "outprod.hpp"
 #include "element_wise.hpp"
+#include "binbcast.hpp"
+#include "argmax.hpp"
+#include "argsort.hpp"
+#include "cpy.hpp"
+#include "getrows.hpp"
+#include "diagmask.hpp"
+#include "scale.hpp"
+#include "clamp.hpp"
+#include "pool2d.hpp"
+#include "sum.hpp"
 #include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP