Commit 8dfc5a7

remove duplicate extra and global work group size
1 parent 19af285 commit 8dfc5a7

File tree

1 file changed (+14 -29 lines)


ggml-sycl.cpp

Lines changed: 14 additions & 29 deletions
@@ -6218,7 +6218,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
             });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        // FIXME: 1024 from cuda
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
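
The FIXME notes that 1024 is inherited from the CUDA backend's block-size limit rather than queried from the hardware. A minimal sketch of one way it could later be resolved, using the standard SYCL 2020 device descriptor; `pick_work_group_size` is a hypothetical helper, not part of this commit:

```cpp
#include <sycl/sycl.hpp>
#include <algorithm>

// Hypothetical helper: clamp the CUDA-derived limit (1024) to what the
// active SYCL device actually supports.
static int pick_work_group_size(sycl::queue & q) {
    // max_work_group_size is a standard SYCL 2020 device descriptor
    const size_t dev_max =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();
    return (int) std::min<size_t>(dev_max, 1024);
}
```

The two hunks that follow make the identical substitution in group_norm_f32_sycl and rms_norm_f32_sycl.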
@@ -6264,7 +6265,7 @@ static void group_norm_f32_sycl(const float *x, float *dst,
             });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -6353,7 +6354,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
             });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
@@ -9187,7 +9188,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size;
+    int max_block_size = 1024;
     while (nth < ncols_x && nth < max_block_size) nth *= 2;
     if (nth>max_block_size) nth = max_block_size;
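
For context, the loop above grows nth to the first power of two that covers a row, capped at the block-size limit that this commit hardcodes. A standalone sketch of the same selection logic, assuming WARP_SIZE is 32 and using ncols_x = 300 as an example input:

```cpp
#include <cstdio>

int main() {
    const int WARP_SIZE      = 32;   // assumed value, for illustration only
    const int ncols_x        = 300;  // example row width
    const int max_block_size = 1024; // the cap hardcoded by this commit

    // double nth until it covers the row or hits the cap
    int nth = WARP_SIZE;
    while (nth < ncols_x && nth < max_block_size) nth *= 2;
    if (nth > max_block_size) nth = max_block_size;

    printf("nth = %d\n", nth); // 32 -> 64 -> 128 -> 256 -> 512, prints 512
    return 0;
}
```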

@@ -11392,14 +11393,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
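
The deleted lines went through ggml_tensor_extra_gpu, a per-device pointer table hung off tensor->extra; the replacement reads the device pointer straight from tensor->data. A toy sketch, with stand-in types rather than the real ggml structs, of why the two paths resolve to the same pointer once the backend buffer stores device memory in data:

```cpp
#include <cstdio>

#define MAX_DEVICES 16

struct tensor_extra {        // stand-in for ggml_tensor_extra_gpu
    void * data_device[MAX_DEVICES];
};

struct tensor {              // stand-in for ggml_tensor
    void * data;             // device pointer, new path
    void * extra;            // -> tensor_extra, old path
};

int main() {
    float buf[4] = {1, 2, 3, 4};     // pretend this is device memory

    tensor_extra extra = {};
    extra.data_device[0] = buf;      // old path: table indexed by device id
    tensor t = { buf, &extra };      // new path: same pointer, stored directly

    const int device = 0;
    float * old_way = (float *) ((tensor_extra *) t.extra)->data_device[device];
    float * new_way = (float *) t.data;

    printf("same pointer: %d\n", old_way == new_way); // prints 1
    return 0;
}
```

The same substitution repeats below in ggml_sycl_mul_mat_vec_nc and ggml_sycl_cpy.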
@@ -11430,15 +11426,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
@@ -11982,9 +11973,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

@@ -11993,11 +11981,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[ctx.device];
-    char * src1_ddc = (char *) src1_extra->data_device[ctx.device];
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
