@@ -74,6 +74,7 @@ struct ggml_cl_version {
74
74
cl_uint minor = 0 ;
75
75
};
76
76
77
+
77
78
struct ggml_cl_compiler_version {
78
79
ADRENO_CL_COMPILER_TYPE type;
79
80
int major = -1 ;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
91
92
}
92
93
};
93
94
95
+ static size_t align_to (size_t value, size_t to_alignment) {
96
+ GGML_ASSERT (to_alignment && " Invalid alignment (must be non-zero)" );
97
+ GGML_ASSERT ((to_alignment & (to_alignment - 1 )) == 0 && " to_alignment must be power-of-two" );
98
+
99
+ return ((value + to_alignment - 1 ) / to_alignment) * to_alignment;
100
+ }
101
+
102
+
94
103
// Parses a version string of the form "XX.YY ". On error, returns a ggml_cl_version with all fields set to zero.
95
104
static ggml_cl_version parse_cl_version (std::string_view str) {
96
105
size_t major_str_begin = 0 ;
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {
248
257
249
258
int adreno_wave_size;
250
259
260
+ cl_bool non_uniform_workgroups;
261
+
251
262
cl_context context;
252
263
cl_command_queue queue;
253
264
@@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1397
1408
GGML_LOG_INFO (" ggml_opencl: SVM atomics support: %s\n " ,
1398
1409
svm_caps & CL_DEVICE_SVM_ATOMICS ? " true" : " false" );
1399
1410
1411
+ if (opencl_c_version.major >= 3 ) {
1412
+ CL_CHECK (clGetDeviceInfo (device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof (cl_bool),
1413
+ &backend_ctx->non_uniform_workgroups , 0 ));
1414
+ } else {
1415
+ GGML_ASSERT (opencl_c_version.major == 2 );
1416
+ // Non-uniform work-group sizes are a mandatory feature in v2.x.
1417
+ backend_ctx->non_uniform_workgroups = true ;
1418
+ }
1419
+
1400
1420
// Print out configurations
1401
1421
#ifdef GGML_OPENCL_SOA_Q
1402
1422
GGML_LOG_INFO (" ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n " );
@@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
2058
2078
// The original tensor memory is divided into scales and quants, i.e.,
2059
2079
// we first store scales, then quants.
2060
2080
// Create subbuffer for scales.
2061
- region.origin = extra_orig->offset + tensor->view_offs + offset;
2081
+ region.origin = align_to ( extra_orig->offset + tensor->view_offs + offset, backend_ctx-> alignment ) ;
2062
2082
region.size = size_d;
2063
2083
extra->d = clCreateSubBuffer (
2064
2084
extra_orig->data_device , CL_MEM_READ_WRITE,
2065
2085
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
2066
2086
CL_CHECK (err);
2087
+ auto previous_origin = region.origin ;
2067
2088
2068
2089
// Create subbuffer for quants.
2069
- region.origin = extra_orig-> offset + tensor-> view_offs + offset + size_d ;
2090
+ region.origin = align_to (previous_origin + size_d, backend_ctx-> alignment ) ;
2070
2091
region.size = size_q;
2071
2092
extra->q = clCreateSubBuffer (
2072
2093
extra_orig->data_device , CL_MEM_READ_WRITE,
@@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
2942
2963
size_t global_work_size[] = {(size_t )n, 1 , 1 };
2943
2964
size_t local_work_size[] = {64 , 1 , 1 };
2944
2965
2966
+ size_t * local_work_size_ptr = local_work_size;
2967
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
2968
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
2969
+ }
2970
+
2945
2971
#ifdef GGML_OPENCL_PROFILING
2946
2972
cl_event evt;
2947
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2973
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
2948
2974
2949
2975
g_profiling_info.emplace_back ();
2950
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2976
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
2951
2977
#else
2952
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2978
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
2953
2979
#endif
2954
2980
} else {
2955
2981
unsigned int nth = MIN (64 , ne0);
@@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3077
3103
size_t global_work_size[] = {(size_t )n, 1 , 1 };
3078
3104
size_t local_work_size[] = {64 , 1 , 1 };
3079
3105
3106
+ size_t * local_work_size_ptr = local_work_size;
3107
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3108
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3109
+ }
3110
+
3080
3111
#ifdef GGML_OPENCL_PROFILING
3081
3112
cl_event evt;
3082
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3113
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
3083
3114
3084
3115
g_profiling_info.emplace_back ();
3085
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3116
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
3086
3117
#else
3087
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3118
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
3088
3119
#endif
3089
3120
} else {
3090
3121
unsigned int nth = MIN (64 , ne0);
@@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3233
3264
size_t global_work_size[] = {(size_t )n, 1 , 1 };
3234
3265
size_t local_work_size[] = {64 , 1 , 1 };
3235
3266
3267
+ size_t * local_work_size_ptr = local_work_size;
3268
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3269
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3270
+ }
3271
+
3236
3272
#ifdef GGML_OPENCL_PROFILING
3237
3273
cl_event evt;
3238
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3274
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
3239
3275
3240
3276
g_profiling_info.emplace_back ();
3241
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3277
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
3242
3278
#else
3243
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3279
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
3244
3280
#endif
3245
3281
}
3246
3282
@@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3273
3309
size_t global_work_size[] = {(size_t )n, 1 , 1 };
3274
3310
size_t local_work_size[] = {64 , 1 , 1 };
3275
3311
3312
+ size_t * local_work_size_ptr = local_work_size;
3313
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3314
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3315
+ }
3316
+
3276
3317
#ifdef GGML_OPENCL_PROFILING
3277
3318
cl_event evt;
3278
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3319
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
3279
3320
3280
3321
g_profiling_info.emplace_back ();
3281
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3322
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
3282
3323
#else
3283
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3324
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
3284
3325
#endif
3285
3326
}
3286
3327
@@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
3320
3361
size_t global_work_size[] = {(size_t )n, 1 , 1 };
3321
3362
size_t local_work_size[] = {64 , 1 , 1 };
3322
3363
3364
+ size_t * local_work_size_ptr = local_work_size;
3365
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3366
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3367
+ }
3368
+
3323
3369
#ifdef GGML_OPENCL_PROFILING
3324
3370
cl_event evt;
3325
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3371
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
3326
3372
3327
3373
g_profiling_info.emplace_back ();
3328
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3374
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
3329
3375
#else
3330
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3376
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
3331
3377
#endif
3332
3378
}
3333
3379
@@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
4230
4276
size_t global_work_size[] = {(size_t )n, 1 , 1 };
4231
4277
size_t local_work_size[] = {64 , 1 , 1 };
4232
4278
4279
+ size_t * local_work_size_ptr = local_work_size;
4280
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
4281
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
4282
+ }
4283
+
4233
4284
#ifdef GGML_OPENCL_PROFILING
4234
4285
cl_event evt;
4235
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
4286
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
4236
4287
4237
4288
g_profiling_info.emplace_back ();
4238
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
4289
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
4239
4290
#else
4240
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
4291
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
4241
4292
#endif
4242
4293
}
4243
4294
@@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
4418
4469
size_t global_work_size[] = {(size_t )ne00, (size_t )ne01, (size_t )ne02};
4419
4470
size_t local_work_size[] = {64 , 1 , 1 };
4420
4471
4472
+ size_t * local_work_size_ptr = local_work_size;
4473
+ if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
4474
+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
4475
+ }
4476
+
4421
4477
#ifdef GGML_OPENCL_PROFILING
4422
4478
cl_event evt;
4423
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
4479
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
4424
4480
4425
4481
g_profiling_info.emplace_back ();
4426
- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
4482
+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
4427
4483
#else
4428
- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
4484
+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
4429
4485
#endif
4430
4486
}
4431
4487
}
0 commit comments