Helper function to set subgroup size

daniandtheweb · daniandtheweb · commit 14ea4fadbcd6 · 2025-02-14T22:22:32.000+01:00
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1423,6 +1423,36 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     return supported;
 }
 
+// Define a configuration map per GPU.
+// Outer key: GPU identifier (e.g. "RX 5700").
+// Inner map: key is pipeline name; value is the subgroup size.
+static std::unordered_map<std::string, std::unordered_map<std::string, uint32_t>> gpu_pipeline_config = {
+    {"RX 5700", {
+        {"im2col_f32", 64},
+        {"im2col_f32_f16", 64}
+    }}
+};
+
+// Helper function defined at namespace scope.
+static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
+    std::string foundKey;
+    for (const auto &entry : gpu_pipeline_config) {
+        if (device_name.find(entry.first) != std::string::npos) {
+            foundKey = entry.first;
+            break;
+        }
+    }
+    if (!foundKey.empty()) {
+        auto &pipelineMap = gpu_pipeline_config[foundKey];
+        auto pipIt = pipelineMap.find(pipeline_name);
+        if (pipIt != pipelineMap.end() && pipIt->second != 0) {
+            return pipIt->second;
+        }
+    }
+    // If not defined, return 0.
+    return 0;
+}
+
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
@@ -1546,11 +1576,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     vk::PhysicalDeviceProperties2 props2;
     device->physical_device.getProperties2(&props2);
     std::string device_name = props2.properties.deviceName.data();
+
     std::vector<std::future<void>> compiles;
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
+        required_subgroup_size = get_subgroup_size(name, device_name);
         if (required_subgroup_size == 0) {
             required_subgroup_size = (device_name.find("RX 5700") != std::string::npos) ? 32 : required_subgroup_size;
         }
@@ -1580,20 +1612,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                       parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
     };
 
-    // New lambda for pipelines with subgroup size 64.
-    auto const &ggml_vk_create_pipeline_64 = [&](vk_device& device, vk_pipeline& pipeline,
-        const std::string &name, size_t spv_size, const void* spv_data,
-        const std::string &entrypoint, uint32_t parameter_count,
-        uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms,
-        const std::vector<uint32_t>& specialization_constants, uint32_t align,
-        bool disable_robustness = false, bool require_full_subgroups = false)
-    {
-        ggml_vk_create_pipeline(device, pipeline, name, spv_size, spv_data, entrypoint,
-                                  parameter_count, push_constant_size, wg_denoms,
-                                  specialization_constants, align, disable_robustness,
-                                  require_full_subgroups, 64);
-    };
-
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
     if (device->coopmat2) {
 
@@ -2172,21 +2190,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
-    // Workaround needed to speedup im2col on RX 5700
-    if (device_name.find("RX 5700")  != std::string::npos) {
-        ggml_vk_create_pipeline_64(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        if (device->float_controls_rte_fp16) {
-            ggml_vk_create_pipeline_64(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        } else {
-            ggml_vk_create_pipeline_64(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        }
+    ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
+    if (device->float_controls_rte_fp16) {
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
     } else {
-        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        if (device->float_controls_rte_fp16) {
-            ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        } else {
-            ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
-        }
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);