@@ -6218,7 +6218,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        // FIXME: 1024 from cuda
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
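Reviewer note: this FIXME (and the matching hunks in group_norm_f32_sycl and rms_norm_f32_sycl below) hardcodes CUDA's 1024-thread limit. A minimal sketch of the follow-up the DPCT1049 comment points at — helper name is ours, not the patch's — would cap the CUDA-derived value by the queried device limit:

```cpp
#include <sycl/sycl.hpp>
#include <algorithm>

// Hypothetical helper (not in this patch): clamp the CUDA-derived 1024 to
// what the device actually supports, per the DPCT1049 advice above.
static int suggested_work_group_size(sycl::queue *stream) {
    const size_t device_max =
        stream->get_device().get_info<sycl::info::device::max_work_group_size>();
    return static_cast<int>(std::min<size_t>(1024, device_max));
}
```

Until something like this lands, a fixed 1024 can exceed max_work_group_size on devices that support fewer work-items per group, which is exactly the failure mode the DPCT1049 notes warn about.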
@@ -6264,7 +6265,7 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size ;
+        const int work_group_size = 1024 ;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -6353,7 +6354,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size ;
+        const int work_group_size = 1024 ;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
@@ -9187,7 +9188,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size ;
+    int max_block_size = 1024 ;
     while (nth < ncols_x && nth < max_block_size) nth *= 2;
     if (nth>max_block_size) nth = max_block_size;
 
@@ -11392,14 +11393,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
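This hunk, and the matching ones in ggml_sycl_mul_mat_vec_nc and ggml_sycl_cpy below, all apply the same substitution: device pointers are read directly from ggml_tensor::data instead of going through the per-device ggml_tensor_extra_gpu table. A minimal before/after sketch of the pattern (the wrapper name is illustrative, not part of the patch):

```cpp
#include "ggml.h"

// Before (removed lines): per-device indirection via the backend-specific extra:
//     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) t->extra;
//     void * ptr = extra->data_device[ctx.device];
// After (added lines): with backend buffers, t->data already holds the device
// pointer, so a plain field read suffices.
static inline void * sycl_device_ptr(const ggml_tensor * t) {
    return t->data;
}
```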
@@ -11430,15 +11426,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
@@ -11982,9 +11973,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
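The dropped asserts checked the tensor backend field that this refactor stops relying on. If a residency check is still wanted here, one hedged option is to assert on the buffer instead; ggml_backend_buffer_is_host() is an existing ggml-backend.h call, but equating "non-host buffer" with "device-resident" is our assumption, not the patch's:

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Sketch only, not part of the patch: approximate the removed
// GGML_BACKEND_TYPE_GPU asserts by requiring a non-host buffer.
static void assert_device_resident(const ggml_tensor * t) {
    GGML_ASSERT(t->buffer != nullptr && !ggml_backend_buffer_is_host(t->buffer));
}
```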
@@ -11993,11 +11981,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[ctx.device];
-    char * src1_ddc = (char *) src1_extra->data_device[ctx.device];
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);