@@ -69,6 +69,13 @@ struct ggml_cl_version {
     cl_uint minor = 0;
 };
 
+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
 // Parses a version string of form "XX.YY". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
     size_t major_str_begin = 0;
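
Note on the new align_to helper (explanatory sketch, not part of the patch): it rounds value up to the nearest multiple of to_alignment, which the asserts constrain to a non-zero power of two. A standalone illustration with example values:

// Illustration only: the same round-up-to-multiple idiom as align_to above.
#include <cassert>
#include <cstddef>

static size_t align_to(size_t value, size_t to_alignment) {
    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}

int main() {
    assert(align_to(100, 64) == 128); // rounds up to the next multiple of 64
    assert(align_to(128, 64) == 128); // already aligned: unchanged
    assert(align_to(1,   64) ==  64);
    return 0;
}
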
@@ -218,6 +225,8 @@ struct ggml_backend_opencl_context {
 
     int adreno_wave_size;
 
+    cl_bool non_uniform_workgroups;
+
     cl_context context;
     cl_command_queue queue;
 
@@ -655,6 +664,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
 
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+        &backend_ctx->non_uniform_workgroups, 0));
+
     // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -1546,15 +1558,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // The original tensor memory is divided into scales and quants, i.e.,
         // we first store scales, then quants.
         // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_d;
         extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
+        auto previous_origin = region.origin;
 
         // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
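
Background on why the origins are aligned (not part of the patch): clCreateSubBuffer fails with CL_MISALIGNED_SUB_BUFFER_OFFSET when the region origin is not a multiple of the device's base address alignment, which CL_DEVICE_MEM_BASE_ADDR_ALIGN reports in bits. A sketch with hypothetical helper names, assuming backend_ctx->alignment stores that value in bytes:

// Query the minimum sub-buffer origin alignment, converting bits to bytes.
#include <CL/cl.h>

static size_t base_alignment_bytes(cl_device_id device) {
    cl_uint align_bits = 0; // error handling omitted for brevity
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                    sizeof(align_bits), &align_bits, NULL);
    return align_bits / 8;
}

static cl_mem make_aligned_subbuffer(cl_mem parent, size_t origin, size_t size,
                                     size_t alignment, cl_int * err) {
    cl_buffer_region region;
    region.origin = ((origin + alignment - 1) / alignment) * alignment; // round up
    region.size   = size;
    return clCreateSubBuffer(parent, CL_MEM_READ_WRITE,
                             CL_BUFFER_CREATE_TYPE_REGION, &region, err);
}
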
@@ -2430,14 +2443,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
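
The same guard recurs in every launch below (mul, silu, relu, clamp, scale, diag_mask_inf): without non-uniform work-group support, OpenCL requires each global_work_size dimension to be an exact multiple of the corresponding local_work_size, so when n is not a multiple of 64 the code passes NULL and lets the driver pick a size that divides n. A condensed standalone sketch of the pattern (hypothetical helper, not from the patch):

// Enqueue an n-wide 1D kernel with a preferred work-group width of 64,
// falling back to a driver-chosen local size when 64 does not divide n
// and the device cannot run non-uniform work-groups.
#include <CL/cl.h>

static cl_int enqueue_1d(cl_command_queue queue, cl_kernel kernel,
                         size_t n, cl_bool non_uniform_supported) {
    size_t global_work_size[] = {n, 1, 1};
    size_t local_work_size[]  = {64, 1, 1};
    const size_t * lws = (n % 64 == 0 || non_uniform_supported)
                             ? local_work_size : NULL;
    return clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                                  global_work_size, lws, 0, NULL, NULL);
}
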
@@ -2565,14 +2583,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
@@ -2721,14 +2744,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -2761,14 +2789,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -2808,14 +2841,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3711,14 +3749,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3899,14 +3942,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     }
 }