#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
+ #include "ggml-sycl/sycl_device.hpp"

void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -48,7 +49,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend);
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
-
+ static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);

void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                    const void *ptr_src, size_t size) {
@@ -2279,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
- if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
- min_compute_capability = ggml_sycl_info().devices[id].cc;
+ if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+ min_compute_capability = ggml_sycl_info().infos[id].cc;
}
- if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
- max_compute_capability = ggml_sycl_info().devices[id].cc;
+ if (max_compute_capability < ggml_sycl_info().infos[id].cc) {
+ max_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
}
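Taken together, the hunks above assume a reworked ggml_sycl_info() result in which ids lists the usable physical device IDs and infos is indexed directly by those IDs, replacing the old devices array and the index-to-ID translation loop. A rough sketch of that assumed shape, with placeholder names and values wherever the diff does not show them:

    // Sketch only: shape of the device info implied by this diff, not the actual header.
    #include <vector>
    #include <sycl/sycl.hpp>

    using queue_ptr = sycl::queue *;     // placeholder for the backend's own alias
    constexpr int MAX_DEVICES = 16;      // stands in for GGML_SYCL_MAX_DEVICES (value assumed)
    constexpr int MAX_STREAMS = 8;       // stands in for GGML_SYCL_MAX_STREAMS (value assumed)

    struct device_entry {
        int       cc;                    // compute capability, read as infos[id].cc
        queue_ptr qptrs[MAX_STREAMS];    // per-device queues; qptrs[0] is used as the default stream
    };

    struct device_info {
        int device_count;                // number of usable devices
        std::vector<int> ids;            // physical device IDs, iterated with range-for
        device_entry infos[MAX_DEVICES]; // indexed directly by physical device ID
    };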
@@ -2680,17 +2681,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
}

#ifdef NDEBUG
- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));
}

- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));

- for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
- int id_other = ggml_backend_sycl_get_device_id(i_other);
+ for (auto &id_other: ggml_sycl_info().ids) {
if (id == id_other) {
continue;
}
@@ -2818,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}
}

- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
} else {
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
}
-
if (convert_src1_to_q8_1) {
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
@@ -2885,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3028,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;

ggml_sycl_set_device(ctx.device);
- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
if (dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3165,8 +3159,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
+ // log_tensor_with_cnt(ctx, "log/src0", src0, -1);
+ // log_tensor_with_cnt(ctx, "log/src1", src1, -1);
+ // log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
+ // log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
GGML_SYCL_DEBUG("call %s done\n", __func__);
+ // exit(1);
}

static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -3417,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
continue;
}

- if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
- min_compute_capability = ggml_sycl_info().devices[id].cc;
+ if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+ min_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
} else {
- min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+ min_compute_capability = ggml_sycl_info().infos[ctx.device].cc;
}

// check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4332,7 +4331,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
-
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

check_allow_device_id(device_id);
@@ -4342,10 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
- for (int i = 0; i < ggml_sycl_info().device_count; i++) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
auto & device = dpct::dev_mgr::instance().get_device(id);
- queue_ptr stream = &(device.default_queue());
+ queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0];
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
@@ -4366,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
- for (int i = 0; i < ggml_sycl_info().device_count; i++) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
@@ -4396,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
struct ggml_backend_sycl_split_buffer_context {
~ggml_backend_sycl_split_buffer_context() try {
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
if (extra->events[id][is] != nullptr) {
/*
@@ -5148,6 +5143,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
return ggml_sycl_info().device_count;
}

+ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
+ fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+ ggml_sycl_info(main_gpu_id);
+ }
+
GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
return sycl_backend;
@@ -5159,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices();
int ggml_backend_sycl_reg_devices() {
assert(ggml_sycl_info().device_count > 0);
- for (int i = 0; i < ggml_sycl_info().device_count; i++) {
- int id = ggml_backend_sycl_get_device_id(i);
+ for (auto & id: ggml_sycl_info().ids) {
char name[128];
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);
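The new ggml_backend_sycl_set_single_device_mode entry point is meant to be called before the backend builds its device table, so that only main_gpu_id is enumerated. A minimal usage sketch, assuming the declaration is exported from the SYCL backend header next to the existing ggml_backend_sycl_init (the header name and call order are assumptions, not shown in this diff):

    // Sketch only: pin the SYCL backend to one GPU before initializing it.
    #include "ggml-sycl.h"      // assumed to declare both SYCL entry points used below
    #include "ggml-backend.h"

    int main() {
        const int main_gpu_id = 0;                              // physical device to use
        ggml_backend_sycl_set_single_device_mode(main_gpu_id);  // must run before the device info is built
        ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
        // ... build and evaluate a graph on `backend` ...
        ggml_backend_free(backend);
        return 0;
    }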