Skip to content

Commit 90c9d04

Browse files
committed
opencl: fix couple crashes
* fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`.
1 parent d0d5b22 commit 90c9d04

File tree

1 file changed

+71
-23
lines changed

1 file changed

+71
-23
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 71 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,13 @@ struct ggml_cl_version {
6969
cl_uint minor = 0;
7070
};
7171

72+
static size_t align_to(size_t value, size_t to_alignment) {
73+
GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
74+
GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
75+
76+
return ((value + to_alignment - 1) / to_alignment) * to_alignment;
77+
}
78+
7279
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
7380
static ggml_cl_version parse_cl_version(std::string_view str) {
7481
size_t major_str_begin = 0;
@@ -218,6 +225,8 @@ struct ggml_backend_opencl_context {
218225

219226
int adreno_wave_size;
220227

228+
cl_bool non_uniform_workgroups;
229+
221230
cl_context context;
222231
cl_command_queue queue;
223232

@@ -655,6 +664,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
655664
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
656665
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
657666

667+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
668+
&backend_ctx->non_uniform_workgroups, 0));
669+
658670
// Print out configurations
659671
#ifdef GGML_OPENCL_SOA_Q
660672
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -1546,15 +1558,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
15461558
// The original tensor memory is divided into scales and quants, i.e.,
15471559
// we first store scales, then quants.
15481560
// Create subbuffer for scales.
1549-
region.origin = extra_orig->offset + tensor->view_offs + offset;
1561+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
15501562
region.size = size_d;
15511563
extra->d = clCreateSubBuffer(
15521564
extra_orig->data_device, CL_MEM_READ_WRITE,
15531565
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
15541566
CL_CHECK(err);
1567+
auto previous_origin = region.origin;
15551568

15561569
// Create subbuffer for quants.
1557-
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
1570+
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
15581571
region.size = size_q;
15591572
extra->q = clCreateSubBuffer(
15601573
extra_orig->data_device, CL_MEM_READ_WRITE,
@@ -2430,14 +2443,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
24302443
size_t global_work_size[] = {(size_t)n, 1, 1};
24312444
size_t local_work_size[] = {64, 1, 1};
24322445

2446+
size_t * local_work_size_ptr = local_work_size;
2447+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2448+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2449+
}
2450+
24332451
#ifdef GGML_OPENCL_PROFILING
24342452
cl_event evt;
2435-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2453+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
24362454

24372455
g_profiling_info.emplace_back();
2438-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2456+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
24392457
#else
2440-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2458+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
24412459
#endif
24422460
} else {
24432461
unsigned int nth = MIN(64, ne0);
@@ -2565,14 +2583,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
25652583
size_t global_work_size[] = {(size_t)n, 1, 1};
25662584
size_t local_work_size[] = {64, 1, 1};
25672585

2586+
size_t * local_work_size_ptr = local_work_size;
2587+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2588+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2589+
}
2590+
25682591
#ifdef GGML_OPENCL_PROFILING
25692592
cl_event evt;
2570-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2593+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
25712594

25722595
g_profiling_info.emplace_back();
2573-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2596+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
25742597
#else
2575-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2598+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
25762599
#endif
25772600
} else {
25782601
unsigned int nth = MIN(64, ne0);
@@ -2721,14 +2744,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
27212744
size_t global_work_size[] = {(size_t)n, 1, 1};
27222745
size_t local_work_size[] = {64, 1, 1};
27232746

2747+
size_t * local_work_size_ptr = local_work_size;
2748+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2749+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2750+
}
2751+
27242752
#ifdef GGML_OPENCL_PROFILING
27252753
cl_event evt;
2726-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2754+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
27272755

27282756
g_profiling_info.emplace_back();
2729-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2757+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
27302758
#else
2731-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2759+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
27322760
#endif
27332761
}
27342762

@@ -2761,14 +2789,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
27612789
size_t global_work_size[] = {(size_t)n, 1, 1};
27622790
size_t local_work_size[] = {64, 1, 1};
27632791

2792+
size_t * local_work_size_ptr = local_work_size;
2793+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2794+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2795+
}
2796+
27642797
#ifdef GGML_OPENCL_PROFILING
27652798
cl_event evt;
2766-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2799+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
27672800

27682801
g_profiling_info.emplace_back();
2769-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2802+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
27702803
#else
2771-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2804+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
27722805
#endif
27732806
}
27742807

@@ -2808,14 +2841,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
28082841
size_t global_work_size[] = {(size_t)n, 1, 1};
28092842
size_t local_work_size[] = {64, 1, 1};
28102843

2844+
size_t * local_work_size_ptr = local_work_size;
2845+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
2846+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
2847+
}
2848+
28112849
#ifdef GGML_OPENCL_PROFILING
28122850
cl_event evt;
2813-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2851+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
28142852

28152853
g_profiling_info.emplace_back();
2816-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2854+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
28172855
#else
2818-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2856+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
28192857
#endif
28202858
}
28212859

@@ -3711,14 +3749,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
37113749
size_t global_work_size[] = {(size_t)n, 1, 1};
37123750
size_t local_work_size[] = {64, 1, 1};
37133751

3752+
size_t * local_work_size_ptr = local_work_size;
3753+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3754+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3755+
}
3756+
37143757
#ifdef GGML_OPENCL_PROFILING
37153758
cl_event evt;
3716-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3759+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
37173760

37183761
g_profiling_info.emplace_back();
3719-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3762+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
37203763
#else
3721-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3764+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
37223765
#endif
37233766
}
37243767

@@ -3899,14 +3942,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
38993942
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
39003943
size_t local_work_size[] = {64, 1, 1};
39013944

3945+
size_t * local_work_size_ptr = local_work_size;
3946+
if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3947+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3948+
}
3949+
39023950
#ifdef GGML_OPENCL_PROFILING
39033951
cl_event evt;
3904-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3952+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
39053953

39063954
g_profiling_info.emplace_back();
3907-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3955+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
39083956
#else
3909-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3957+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
39103958
#endif
39113959
}
39123960
}

0 commit comments

Comments
 (0)