 #include "ggml-cpu.h"
 #endif
 
+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
+
 #include <vulkan/vulkan.hpp>
 
+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -90,6 +96,9 @@ struct vk_pipeline_struct {
     bool needed {};
     // set to true when the shader has been compiled
     bool compiled {};
+    // number of registers used, extracted from pipeline executable properties
+    uint32_t register_count {};
+    std::vector<uint32_t> specialization_constants;
 };
 
 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -184,6 +193,8 @@ struct vk_device_struct {
     uint32_t coopmat_k;
     bool coopmat2;
 
+    bool pipeline_executable_properties_support {};
+
     size_t idx;
 
     bool mul_mat_l[GGML_TYPE_COUNT];
@@ -893,6 +904,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
     }
     pipeline->compiled = true;
 
+    if (device->pipeline_executable_properties_support) {
+        vk::PipelineExecutableInfoKHR executableInfo;
+        executableInfo.pipeline = pipeline->pipeline;
+
+        auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
+        for (auto & s : statistics) {
+            VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64);
+            // "Register Count" is reported by NVIDIA drivers.
+            if (strcmp(s.name, "Register Count") == 0) {
+                pipeline->register_count = (uint32_t)s.value.u64;
+            }
+        }
+    }
+
     {
         std::lock_guard<std::mutex> guard(device->mutex);
         device->pipelines.insert({ pipeline->name, pipeline });
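
Note for context: VK_KHR_pipeline_executable_properties only returns statistics for pipelines created with the capture-statistics flag, so the pipeline creation path (not shown in this excerpt) has to request it when the feature is supported. A minimal Vulkan-Hpp sketch of that pairing; the create-info variable names are illustrative, not taken from this patch:

    // Sketch only: request statistics capture at pipeline creation, then query them.
    // Assumes pipeline_shader_create_info and pipeline->layout are set up elsewhere, and that
    // the flag is only used when device->pipeline_executable_properties_support is true.
    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
        vk::PipelineCreateFlagBits::eCaptureStatisticsKHR,   // required for getPipelineExecutableStatisticsKHR
        pipeline_shader_create_info,
        pipeline->layout);

    vk::PipelineExecutableInfoKHR exec_info;
    exec_info.pipeline        = pipeline->pipeline;
    exec_info.executableIndex = 0;
    for (const auto & stat : device->device.getPipelineExecutableStatisticsKHR(exec_info)) {
        // stat.name and stat.format are driver-defined; NVIDIA exposes "Register Count" as a 64-bit integer.
    }
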
@@ -1581,6 +1606,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
             pipeline->push_constant_size = push_constant_size;
             pipeline->wg_denoms = wg_denoms;
             pipeline->align = align;
+            pipeline->specialization_constants = specialization_constants;
         }
 
         if (!pipeline->needed || pipeline->compiled) {
@@ -2289,6 +2315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
         bool amd_shader_core_properties2 = false;
         bool pipeline_robustness = false;
         bool coopmat2_support = false;
+        bool pipeline_executable_properties_support = false;
         device->coopmat_support = false;
 
         // Check if maintenance4 is supported
@@ -2316,6 +2343,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
             } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                        !getenv("GGML_VK_DISABLE_COOPMAT2")) {
                 coopmat2_support = true;
+            } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
+                pipeline_executable_properties_support = true;
             }
         }
 
@@ -2500,8 +2529,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
             device_extensions.push_back("VK_KHR_maintenance4");
         }
 
+        VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
+        pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
+        if (pipeline_executable_properties_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&pep_features;
+            last_struct = (VkBaseOutStructure *)&pep_features;
+            device_extensions.push_back("VK_KHR_pipeline_executable_properties");
+        }
+
         vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
 
+        device->pipeline_executable_properties_support = pipeline_executable_properties_support;
+
         device->fp16 = device->fp16 && vk12_features.shaderFloat16;
 
         device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
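
As an aside, pep_features above follows the usual Vulkan feature-query pattern: chain the extension's feature struct into VkPhysicalDeviceFeatures2 via pNext, query it, and keep it in the chain handed to device creation so the feature is actually enabled. A generic sketch of that pattern, independent of the surrounding variables (physical_device is assumed to be a valid VkPhysicalDevice):

    // Generic sketch (names illustrative): query and enable a pNext-chained feature.
    VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
    pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;

    VkPhysicalDeviceFeatures2 features2 {};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &pep_features;

    vkGetPhysicalDeviceFeatures2(physical_device, &features2);
    if (pep_features.pipelineExecutableInfo) {
        // Supported: keep pep_features chained into VkDeviceCreateInfo::pNext (via features2)
        // and add "VK_KHR_pipeline_executable_properties" to the enabled extension list.
    }
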
@@ -2876,6 +2915,9 @@ static void ggml_vk_instance_init() {
     }
     VK_LOG_DEBUG("ggml_vk_instance_init()");
 
+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init();
+
     uint32_t api_version = vk::enumerateInstanceVersion();
 
     if (api_version < VK_API_VERSION_1_2) {
@@ -2928,6 +2970,9 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;
 
+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
 
     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
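
The two init() calls above, together with the VULKAN_HPP_DISPATCH_LOADER_DYNAMIC define and the dispatcher storage added near the top of the file, are the standard Vulkan-Hpp dynamic-dispatch recipe from the linked README. A condensed, self-contained sketch of the pattern (not project code):

    #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
    #include <vulkan/vulkan.hpp>

    VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE   // instantiate in exactly one translation unit

    int main() {
        VULKAN_HPP_DEFAULT_DISPATCHER.init();             // load vkGetInstanceProcAddr and global entry points
        vk::Instance instance = vk::createInstance(vk::InstanceCreateInfo{});
        VULKAN_HPP_DEFAULT_DISPATCHER.init(instance);     // load instance-level (and, via it, device-level) entry points
        // optionally, after device creation: VULKAN_HPP_DEFAULT_DISPATCHER.init(device);
        return 0;
    }
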
@@ -3832,12 +3877,21 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
 
     uint32_t split_k = 1;
-    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
-        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+    if (ctx->device->shader_core_count != 0) {
         uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
         uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
-        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
-            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+        uint32_t occupancy_factor = 1;
+        // Estimate how many workgroups can fit on an SM at a time.
+        // Other factors like shared memory could affect this, and aren't taken into account.
+        if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && pipeline->register_count > 0) {
+            uint32_t block_size = pipeline->specialization_constants[0];
+            assert(block_size > 0);
+            occupancy_factor = 65536 / (block_size * pipeline->register_count);
+        }
+        // The extra factor of 4 is to try to run up to 4x as many workgroups as can fit,
+        // to prefer shorter shaders that will be less prone to tail effects
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count * occupancy_factor * 4) {
+            split_k = occupancy_factor * 4 * ctx->device->shader_core_count / (m_tiles * n_tiles);
             // Clamp to 2 or 4
             split_k = std::min(split_k, 4u);
             if (split_k == 3) {
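
To make the occupancy estimate concrete: the 65536 constant corresponds to the 64K 32-bit registers available per SM on recent NVIDIA GPUs, so with hypothetical values of block_size = 256 threads and register_count = 128 registers per thread, occupancy_factor = 65536 / (256 * 128) = 2 resident workgroups per SM, and split_k is considered whenever k >= 2048 and m_tiles * n_tiles < shader_core_count * 2 * 4.
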
@@ -4122,7 +4176,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = padded_n * ne10;
     const int d_ne = ne11 * ne01;
 
-    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+    uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
 
     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -4146,10 +4200,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
 
+    const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
-        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
             (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
@@ -4174,11 +4228,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         if (qy_needs_dequant) {
             ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
         }
-        if (split_k > 1) {
+        // ggml_vk_guess_split_k may make a different determination after the pipeline
+        // is compiled (based on register count), so prepare for split_k just in case.
+        if (split_k > 1 || !pipeline->compiled) {
             ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
         }
         return;
     }
+    // ggml_vk_guess_split_k may make a different determination after the pipeline
+    // is compiled (based on register count). Fall back to no split_k if we didn't
+    // reserve enough memory.
+    if (split_k_size > ctx->prealloc_size_split_k) {
+        split_k = 1;
+    }
 
     vk_buffer d_D = dst_buf_ctx->dev_buffer;
     const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;