Commit 20379a2
vulkan: query register count and use it in a better split_k heuristic
Use VK_KHR_pipeline_executable_properties to query the register count, and use that to better estimate how many workgroups can fit in the SMs. Particularly with the recent tile size changes (#12258), the old heuristic is out of date. The new heuristic benefits both the coopmat1 and coopmat2 paths on NVIDIA. It would be good if somebody could hook up the missing details for other hardware. Calling getPipelineExecutableStatisticsKHR required more fully initializing Vulkan-HPP; the steps needed are documented in the Vulkan-HPP readme.
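As a rough illustration of the arithmetic behind the new heuristic (the block size, register count, and SM count below are hypothetical, not taken from any real shader): an NVIDIA SM has a 64K-entry register file, so the number of resident workgroups is estimated as 65536 / (block_size * register_count), and split_k is considered when the tile grid covers less than 4x that capacity across all SMs.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical values, for illustration only.
    uint32_t block_size        = 256;  // workgroup size (specialization constant 0)
    uint32_t register_count    = 128;  // per-thread registers reported by the driver
    uint32_t shader_core_count = 114;  // number of SMs

    // Workgroups estimated to fit on one SM at a time, limited by the 64K register file.
    uint32_t occupancy_factor = 65536 / (block_size * register_count);   // = 2

    // split_k kicks in when the m x n tile grid covers less than 4x this capacity.
    uint32_t threshold = shader_core_count * occupancy_factor * 4;       // = 912 workgroups
    printf("occupancy_factor = %u, split_k threshold = %u workgroups\n", occupancy_factor, threshold);
    return 0;
}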
1 parent a768a65 commit 20379a2

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 69 additions & 7 deletions
@@ -5,8 +5,14 @@
 #include "ggml-cpu.h"
 #endif

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
+
 #include <vulkan/vulkan.hpp>

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -90,6 +96,9 @@ struct vk_pipeline_struct {
     bool needed {};
     // set to true when the shader has been compiled
     bool compiled {};
+    // number of registers used, extracted from pipeline executable properties
+    uint32_t register_count {};
+    std::vector<uint32_t> specialization_constants;
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -184,6 +193,8 @@ struct vk_device_struct {
     uint32_t coopmat_k;
     bool coopmat2;

+    bool pipeline_executable_properties_support {};
+
     size_t idx;

     bool mul_mat_l[GGML_TYPE_COUNT];
@@ -893,6 +904,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
     }
     pipeline->compiled = true;

+    if (device->pipeline_executable_properties_support) {
+        vk::PipelineExecutableInfoKHR executableInfo;
+        executableInfo.pipeline = pipeline->pipeline;
+
+        auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
+        for (auto & s : statistics) {
+            VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64);
+            // "Register Count" is reported by NVIDIA drivers.
+            if (strcmp(s.name, "Register Count") == 0) {
+                pipeline->register_count = (uint32_t)s.value.u64;
+            }
+        }
+    }
+
     {
         std::lock_guard<std::mutex> guard(device->mutex);
         device->pipelines.insert({ pipeline->name, pipeline });
@@ -1581,6 +1606,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         pipeline->push_constant_size = push_constant_size;
         pipeline->wg_denoms = wg_denoms;
         pipeline->align = align;
+        pipeline->specialization_constants = specialization_constants;
     }

     if (!pipeline->needed || pipeline->compiled) {
@@ -2289,6 +2315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     bool amd_shader_core_properties2 = false;
     bool pipeline_robustness = false;
     bool coopmat2_support = false;
+    bool pipeline_executable_properties_support = false;
     device->coopmat_support = false;

     // Check if maintenance4 is supported
@@ -2316,6 +2343,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                    !getenv("GGML_VK_DISABLE_COOPMAT2")) {
             coopmat2_support = true;
+        } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
+            pipeline_executable_properties_support = true;
         }
     }

@@ -2500,8 +2529,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device_extensions.push_back("VK_KHR_maintenance4");
     }

+    VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
+    pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
+    if (pipeline_executable_properties_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&pep_features;
+        last_struct = (VkBaseOutStructure *)&pep_features;
+        device_extensions.push_back("VK_KHR_pipeline_executable_properties");
+    }
+
     vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

+    device->pipeline_executable_properties_support = pipeline_executable_properties_support;
+
     device->fp16 = device->fp16 && vk12_features.shaderFloat16;

     device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
@@ -2876,6 +2915,9 @@ static void ggml_vk_instance_init() {
     }
     VK_LOG_DEBUG("ggml_vk_instance_init()");

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init();
+
     uint32_t api_version = vk::enumerateInstanceVersion();

     if (api_version < VK_API_VERSION_1_2) {
@@ -2928,6 +2970,9 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -3832,12 +3877,21 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

     uint32_t split_k = 1;
-    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
-        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+    if (ctx->device->shader_core_count != 0) {
         uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
         uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
-        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
-            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+        uint32_t occupancy_factor = 1;
+        // Estimate how many workgroups can fit on an SM at a time.
+        // Other factors like shared memory could affect this, and aren't taken into account.
+        if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && pipeline->register_count > 0) {
+            uint32_t block_size = pipeline->specialization_constants[0];
+            assert(block_size > 0);
+            occupancy_factor = 65536 / (block_size * pipeline->register_count);
+        }
+        // The extra factor of 4 is to try to run up to 4x as many workgroups as can fit,
+        // to prefer shorter shaders that will be less prone to tail effects
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count * occupancy_factor * 4) {
+            split_k = occupancy_factor * 4 * ctx->device->shader_core_count / (m_tiles * n_tiles);
             // Clamp to 2 or 4
             split_k = std::min(split_k, 4u);
             if (split_k == 3) {
@@ -4122,7 +4176,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = padded_n * ne10;
     const int d_ne = ne11 * ne01;

-    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+    uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -4146,10 +4200,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

+    const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
     if (dryrun) {
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
-        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
             (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
             (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
@@ -4174,11 +4228,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         if (qy_needs_dequant) {
             ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
         }
-        if (split_k > 1) {
+        // ggml_vk_guess_split_k may make a different determination after the pipeline
+        // is compiled (based on register count), so prepare for split_k just in case.
+        if (split_k > 1 || !pipeline->compiled) {
             ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
         }
         return;
     }
+    // ggml_vk_guess_split_k may make a different determination after the pipeline
+    // is compiled (based on register count). Fallback to no split_k if we didn't
+    // reserve enough memory.
+    if (split_k_size > ctx->prealloc_size_split_k) {
+        split_k = 1;
+    }

     vk_buffer d_D = dst_buf_ctx->dev_buffer;
     const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
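
For reference, the Vulkan-HPP dynamic-dispatch setup that the commit message refers to reduces to the steps below. This is a minimal standalone sketch following the readme section linked in the diff, not code from this commit, and error handling is omitted.

// Minimal sketch of the Vulkan-HPP dynamic dispatcher setup.
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>

// Exactly one translation unit must define the default dispatcher storage.
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

int main() {
    // Load vkGetInstanceProcAddr and the global-level entry points.
    VULKAN_HPP_DEFAULT_DISPATCHER.init();

    // After creating an instance, load the instance-level entry points, which makes
    // extension calls such as getPipelineExecutableStatisticsKHR resolvable later on.
    vk::Instance instance = vk::createInstance(vk::InstanceCreateInfo{});
    VULKAN_HPP_DEFAULT_DISPATCHER.init(instance);

    instance.destroy();
    return 0;
}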
