Refactor device management and usage API #2

Merged
merged 6 commits on Aug 1, 2024
2 changes: 2 additions & 0 deletions ggml/include/ggml-sycl.h
@@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
63 changes: 32 additions & 31 deletions ggml/src/ggml-sycl.cpp
@@ -38,6 +38,7 @@

#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/sycl_device.hpp"


void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -48,7 +49,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend);
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);

static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);

void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
@@ -2279,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
min_compute_capability = ggml_sycl_info().devices[id].cc;
if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
min_compute_capability = ggml_sycl_info().infos[id].cc;
}
if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
max_compute_capability = ggml_sycl_info().devices[id].cc;
if (max_compute_capability < ggml_sycl_info().infos[id].cc) {
max_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
}
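Note: the hunk above renames `devices[id]` to `infos[id]`, and later hunks replace the `device_count` / `ggml_backend_sycl_get_device_id(i)` loops with a range-for over `ggml_sycl_info().ids`. For orientation, a minimal sketch of what the refactored bookkeeping in `ggml-sycl/sycl_device.hpp` presumably looks like; only the names `ids`, `infos`, `cc` and `qptrs` are taken from this diff, while the container types, extra fields and the `main_gpu_id` overload are assumptions:

```cpp
// Sketch only: assumed layout of the refactored device bookkeeping.
#include <sycl/sycl.hpp>
#include <unordered_map>
#include <vector>

using queue_ptr = sycl::queue *;      // as in the existing ggml-sycl headers
#define GGML_SYCL_MAX_STREAMS 8       // real value lives in ggml-sycl/presets.hpp

struct ggml_sycl_device_info {
    int device_count = 0;
    std::vector<int> ids;             // ids of the SYCL devices actually in use

    struct device_record {
        int       cc = 0;             // compute capability, as read above
        queue_ptr qptrs[GGML_SYCL_MAX_STREAMS] = { nullptr };  // per-device streams
        // ... memory sizes, work-group limits, etc.
    };
    std::unordered_map<int, device_record> infos;  // keyed by device id
};

// Assumed accessor: returns the process-wide singleton; passing an id presumably
// restricts the table to that one GPU (see ggml_backend_sycl_set_single_device_mode).
ggml_sycl_device_info & ggml_sycl_info(int main_gpu_id = -1);
```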
@@ -2680,17 +2681,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
}

#ifdef NDEBUG
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));
}

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));

for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
int id_other = ggml_backend_sycl_get_device_id(i_other);
for (auto &id_other: ggml_sycl_info().ids) {
if (id == id_other) {
continue;
}
@@ -2818,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}
}

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
} else {
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
}

if (convert_src1_to_q8_1) {
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

@@ -2885,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3028,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;

ggml_sycl_set_device(ctx.device);
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if (dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3165,8 +3159,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s

static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
// log_tensor_with_cnt(ctx, "log/src0", src0, -1);
// log_tensor_with_cnt(ctx, "log/src1", src1, -1);
// log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
// log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
GGML_SYCL_DEBUG("call %s done\n", __func__);
// exit(1);
}

static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -3417,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
continue;
}

if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
min_compute_capability = ggml_sycl_info().devices[id].cc;
if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
min_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
} else {
min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
min_compute_capability = ggml_sycl_info().infos[ctx.device].cc;
}

// check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4332,7 +4331,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

check_allow_device_id(device_id);
@@ -4342,10 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
auto & device = dpct::dev_mgr::instance().get_device(id);
queue_ptr stream = &(device.default_queue());
queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0];
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
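Note: in the hunk above the default stream for a buffer type now comes from the queue pointers cached in `ggml_sycl_info().infos[id].qptrs` rather than from `dpct::dev_mgr` at call time. A before/after sketch, assuming `qptrs[0]` holds the device's default queue created while the info table is built (the helper name is hypothetical):

```cpp
// Hypothetical helper illustrating the lookup change; not part of the PR.
static queue_ptr default_stream_for(int id) {
    // before: fetch the default queue from dpct on every lookup
    // queue_ptr stream = &(dpct::dev_mgr::instance().get_device(id).default_queue());

    // after: reuse the queue pointer cached next to the rest of the device info
    return ggml_sycl_info().infos[id].qptrs[0];
}
```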
@@ -4366,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
@@ -4396,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
struct ggml_backend_sycl_split_buffer_context {
~ggml_backend_sycl_split_buffer_context() try {
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
if (extra->events[id][is] != nullptr) {
/*
@@ -5148,6 +5143,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
return ggml_sycl_info().device_count;
}

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
fprintf(stderr, "ggml_backend_sycl_set_single_device_mode: use single device: [%d]\n", main_gpu_id);
ggml_sycl_info(main_gpu_id);
}
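For context, a minimal caller-side sketch of how the new entry point is meant to be used together with the existing `ggml_backend_sycl_init(device)`; the wrapper is hypothetical, and calling it before any backend or buffer type exists is an assumption based on `ggml_sycl_info()` being built lazily:

```cpp
// Hypothetical wrapper: pin ggml-sycl to a single GPU.
// Only the two ggml_backend_sycl_* calls are real API; the wrapper itself is illustrative.
static ggml_backend_t init_backend_on_single_gpu(int main_gpu_id) {
    // Assumed to be required first so ggml_sycl_info() is seeded with just this device.
    ggml_backend_sycl_set_single_device_mode(main_gpu_id);
    return ggml_backend_sycl_init(main_gpu_id);
}
```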

GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
return sycl_backend;
@@ -5159,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices();

int ggml_backend_sycl_reg_devices() {
assert(ggml_sycl_info().device_count>0);
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
char name[128];
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);