Skip to content

Commit edbf42e

Browse files
authored
opencl: fix couple crashes (#12795)
* opencl: fix couple crashes * fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`. * OpenCL: query non-uniform WG sizes only on OpenCL 3.0+
1 parent d643bb2 commit edbf42e

File tree

1 file changed

+79
-23
lines changed

1 file changed

+79
-23
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 79 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ struct ggml_cl_version {
7474
cl_uint minor = 0;
7575
};
7676

77+
7778
struct ggml_cl_compiler_version {
7879
ADRENO_CL_COMPILER_TYPE type;
7980
int major = -1;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
9192
}
9293
};
9394

95+
static size_t align_to(size_t value, size_t to_alignment) {
96+
GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
97+
GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
98+
99+
return ((value + to_alignment - 1) / to_alignment) * to_alignment;
100+
}
101+
102+
94103
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
95104
static ggml_cl_version parse_cl_version(std::string_view str) {
96105
size_t major_str_begin = 0;
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {
248257

249258
int adreno_wave_size;
250259

260+
cl_bool non_uniform_workgroups;
261+
251262
cl_context context;
252263
cl_command_queue queue;
253264

@@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
13971408
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
13981409
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
13991410

1411+
if (opencl_c_version.major >= 3) {
1412+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
1413+
&backend_ctx->non_uniform_workgroups, 0));
1414+
} else {
1415+
GGML_ASSERT(opencl_c_version.major == 2);
1416+
// Non-uniform workgroup sizes is mandatory feature in v2.x.
1417+
backend_ctx->non_uniform_workgroups = true;
1418+
}
1419+
14001420
// Print out configurations
14011421
#ifdef GGML_OPENCL_SOA_Q
14021422
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
20582078
// The original tensor memory is divided into scales and quants, i.e.,
20592079
// we first store scales, then quants.
20602080
// Create subbuffer for scales.
2061-
region.origin = extra_orig->offset + tensor->view_offs + offset;
2081+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
20622082
region.size = size_d;
20632083
extra->d = clCreateSubBuffer(
20642084
extra_orig->data_device, CL_MEM_READ_WRITE,
20652085
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
20662086
CL_CHECK(err);
2087+
auto previous_origin = region.origin;
20672088

20682089
// Create subbuffer for quants.
2069-
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
2090+
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
20702091
region.size = size_q;
20712092
extra->q = clCreateSubBuffer(
20722093
extra_orig->data_device, CL_MEM_READ_WRITE,
@@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
29422963
size_t global_work_size[] = {(size_t)n, 1, 1};
29432964
size_t local_work_size[] = {64, 1, 1};
29442965

2966+
size_t * local_work_size_ptr = local_work_size;
2967+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2968+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2969+
}
2970+
29452971
#ifdef GGML_OPENCL_PROFILING
29462972
cl_event evt;
2947-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2973+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
29482974

29492975
g_profiling_info.emplace_back();
2950-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2976+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
29512977
#else
2952-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2978+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
29532979
#endif
29542980
} else {
29552981
unsigned int nth = MIN(64, ne0);
@@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
30773103
size_t global_work_size[] = {(size_t)n, 1, 1};
30783104
size_t local_work_size[] = {64, 1, 1};
30793105

3106+
size_t * local_work_size_ptr = local_work_size;
3107+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3108+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3109+
}
3110+
30803111
#ifdef GGML_OPENCL_PROFILING
30813112
cl_event evt;
3082-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3113+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
30833114

30843115
g_profiling_info.emplace_back();
3085-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3116+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
30863117
#else
3087-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3118+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
30883119
#endif
30893120
} else {
30903121
unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
32333264
size_t global_work_size[] = {(size_t)n, 1, 1};
32343265
size_t local_work_size[] = {64, 1, 1};
32353266

3267+
size_t * local_work_size_ptr = local_work_size;
3268+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3269+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3270+
}
3271+
32363272
#ifdef GGML_OPENCL_PROFILING
32373273
cl_event evt;
3238-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3274+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
32393275

32403276
g_profiling_info.emplace_back();
3241-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3277+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
32423278
#else
3243-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3279+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
32443280
#endif
32453281
}
32463282

@@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
32733309
size_t global_work_size[] = {(size_t)n, 1, 1};
32743310
size_t local_work_size[] = {64, 1, 1};
32753311

3312+
size_t * local_work_size_ptr = local_work_size;
3313+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3314+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3315+
}
3316+
32763317
#ifdef GGML_OPENCL_PROFILING
32773318
cl_event evt;
3278-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3319+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
32793320

32803321
g_profiling_info.emplace_back();
3281-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3322+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
32823323
#else
3283-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3324+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
32843325
#endif
32853326
}
32863327

@@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
33203361
size_t global_work_size[] = {(size_t)n, 1, 1};
33213362
size_t local_work_size[] = {64, 1, 1};
33223363

3364+
size_t * local_work_size_ptr = local_work_size;
3365+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3366+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3367+
}
3368+
33233369
#ifdef GGML_OPENCL_PROFILING
33243370
cl_event evt;
3325-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3371+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
33263372

33273373
g_profiling_info.emplace_back();
3328-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3374+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
33293375
#else
3330-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3376+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
33313377
#endif
33323378
}
33333379

@@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
42304276
size_t global_work_size[] = {(size_t)n, 1, 1};
42314277
size_t local_work_size[] = {64, 1, 1};
42324278

4279+
size_t * local_work_size_ptr = local_work_size;
4280+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
4281+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4282+
}
4283+
42334284
#ifdef GGML_OPENCL_PROFILING
42344285
cl_event evt;
4235-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4286+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
42364287

42374288
g_profiling_info.emplace_back();
4238-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4289+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
42394290
#else
4240-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4291+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
42414292
#endif
42424293
}
42434294

@@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
44184469
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
44194470
size_t local_work_size[] = {64, 1, 1};
44204471

4472+
size_t * local_work_size_ptr = local_work_size;
4473+
if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
4474+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4475+
}
4476+
44214477
#ifdef GGML_OPENCL_PROFILING
44224478
cl_event evt;
4423-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4479+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
44244480

44254481
g_profiling_info.emplace_back();
4426-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4482+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
44274483
#else
4428-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4484+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
44294485
#endif
44304486
}
44314487
}

0 commit comments

Comments
 (0)