@@ -168,14 +168,19 @@ struct vk_device_struct {
168
168
uint32_t subgroup_size;
169
169
uint32_t shader_core_count;
170
170
bool uma;
171
- bool coopmat2;
171
+
172
+ bool subgroup_size_control;
173
+ uint32_t subgroup_min_size;
174
+ uint32_t subgroup_max_size;
175
+ bool subgroup_require_full_support;
172
176
173
177
bool coopmat_support;
174
178
bool coopmat_acc_f32_support;
175
179
bool coopmat_acc_f16_support;
176
180
uint32_t coopmat_m;
177
181
uint32_t coopmat_n;
178
182
uint32_t coopmat_k;
183
+ bool coopmat2;
179
184
180
185
size_t idx;
181
186
@@ -753,8 +758,12 @@ static uint32_t compile_count = 0;
753
758
static std::mutex compile_count_mutex;
754
759
static std::condition_variable compile_count_cond;
755
760
756
- static void ggml_vk_create_pipeline_func (vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void * spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, std::vector<uint32_t > specialization_constants, uint32_t align, bool disable_robustness) {
757
- VK_LOG_DEBUG (" ggml_vk_create_pipeline(" << device->name << " , " << name << " , " << entrypoint << " , " << parameter_count << " , " << push_constant_size << " , (" << wg_denoms[0 ] << " ," << wg_denoms[1 ] << " ," << wg_denoms[2 ] << " ), specialization_constants, " << align << " )" );
761
+ static void ggml_vk_create_pipeline_func (vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void * spv_data, const std::string entrypoint,
762
+ uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, std::vector<uint32_t > specialization_constants,
763
+ uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
764
+ VK_LOG_DEBUG (" ggml_vk_create_pipeline(" << device->name << " , " << name << " , " << entrypoint << " , " << parameter_count << " , " << push_constant_size <<
765
+ " , (" << wg_denoms[0 ] << " ," << wg_denoms[1 ] << " ," << wg_denoms[2 ] << " ), specialization_constants, " << align <<
766
+ " , " << disable_robustness << " , " << require_full_subgroups << " , " << required_subgroup_size << " )" );
758
767
GGML_ASSERT (parameter_count > 0 );
759
768
GGML_ASSERT (wg_denoms[0 ] > 0 && wg_denoms[1 ] > 0 && wg_denoms[2 ] > 0 ); // NOLINT
760
769
@@ -813,14 +822,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
813
822
specialization_constants.data ()
814
823
);
815
824
825
+ vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
826
+
827
+ if (device->subgroup_require_full_support && require_full_subgroups) {
828
+ pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
829
+ }
830
+
816
831
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info (
817
- vk::PipelineShaderStageCreateFlags () ,
832
+ pipeline_shader_stage_create_flags ,
818
833
vk::ShaderStageFlagBits::eCompute,
819
834
pipeline->shader_module ,
820
835
entrypoint.c_str (),
821
836
&specialization_info);
837
+
838
+ vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
839
+ pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
840
+ if (device->subgroup_size_control && required_subgroup_size > 0 ) {
841
+ GGML_ASSERT (device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size );
842
+ pipeline_shader_create_info.setPNext (&pipeline_shader_stage_required_subgroup_size_create_info);
843
+ }
844
+
822
845
vk::ComputePipelineCreateInfo compute_pipeline_create_info (
823
- vk::PipelineCreateFlags () ,
846
+ vk::PipelineCreateFlags{} ,
824
847
pipeline_shader_create_info,
825
848
pipeline->layout );
826
849
@@ -1500,7 +1523,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
1500
1523
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1501
1524
1502
1525
std::vector<std::future<void >> compiles;
1503
- auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void * spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, const std::vector<uint32_t >& specialization_constants, uint32_t align, bool disable_robustness = false ) {
1526
+ auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void * spv_data, const std::string &entrypoint,
1527
+ uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, const std::vector<uint32_t >& specialization_constants,
1528
+ uint32_t align, bool disable_robustness = false , bool require_full_subgroups = false , uint32_t required_subgroup_size = 0 ) {
1504
1529
{
1505
1530
// wait until fewer than N compiles are in progress
1506
1531
uint32_t N = std::max (1u , std::thread::hardware_concurrency ());
@@ -1510,7 +1535,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
1510
1535
}
1511
1536
compile_count++;
1512
1537
}
1513
- compiles.push_back (std::async (ggml_vk_create_pipeline_func, std::ref (device), std::ref (pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
1538
+ compiles.push_back (std::async (ggml_vk_create_pipeline_func, std::ref (device), std::ref (pipeline), name, spv_size, spv_data, entrypoint,
1539
+ parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
1514
1540
};
1515
1541
1516
1542
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@@ -1616,17 +1642,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
1616
1642
// Create 6 variants, {s,m,l}x{unaligned,aligned}
1617
1643
#define CREATE_MM (PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID ) \
1618
1644
if (device->mul_mat ## ID ## _l) \
1619
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->l , #NAMELC #F16ACC " _l" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1 ); \
1645
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->l , #NAMELC #F16ACC " _l" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1 , false , true ); \
1620
1646
if (device->mul_mat ## ID ## _m) \
1621
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->m , #NAMELC #F16ACC " _m" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1 ); \
1647
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->m , #NAMELC #F16ACC " _m" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1 , false , true ); \
1622
1648
if (device->mul_mat ## ID ## _s) \
1623
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->s , #NAMELC #F16ACC " _s" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1 ); \
1649
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->s , #NAMELC #F16ACC " _s" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1 , false , true ); \
1624
1650
if (device->mul_mat ## ID ## _l) \
1625
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_l , #NAMELC #F16ACC " _aligned_l" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
1651
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_l , #NAMELC #F16ACC " _aligned_l" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false , true ); \
1626
1652
if (device->mul_mat ## ID ## _m) \
1627
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_m , #NAMELC #F16ACC " _aligned_m" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
1653
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_m , #NAMELC #F16ACC " _aligned_m" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false , true ); \
1628
1654
if (device->mul_mat ## ID ## _s) \
1629
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_s , #NAMELC #F16ACC " _aligned_s" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
1655
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_s , #NAMELC #F16ACC " _aligned_s" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false , true ); \
1630
1656
1631
1657
// Create 2 variants, {f16,f32} accumulator
1632
1658
#define CREATE_MM2 (PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID ) \
@@ -1993,6 +2019,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
1993
2019
amd_shader_core_properties2 = true ;
1994
2020
} else if (strcmp (" VK_EXT_pipeline_robustness" , properties.extensionName ) == 0 ) {
1995
2021
pipeline_robustness = true ;
2022
+ } else if (strcmp (" VK_EXT_subgroup_size_control" , properties.extensionName ) == 0 ) {
2023
+ device->subgroup_size_control = true ;
1996
2024
} else if (strcmp (" VK_KHR_cooperative_matrix" , properties.extensionName ) == 0 &&
1997
2025
!getenv (" GGML_VK_DISABLE_COOPMAT" )) {
1998
2026
device->coopmat_support = true ;
@@ -2012,6 +2040,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
2012
2040
vk::PhysicalDeviceDriverProperties driver_props;
2013
2041
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
2014
2042
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
2043
+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
2044
+
2015
2045
props2.pNext = &props3;
2016
2046
props3.pNext = &subgroup_props;
2017
2047
subgroup_props.pNext = &driver_props;
@@ -2030,6 +2060,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
2030
2060
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
2031
2061
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
2032
2062
}
2063
+ if (device->subgroup_size_control ) {
2064
+ last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
2065
+ last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
2066
+ }
2033
2067
2034
2068
#if defined(VK_NV_cooperative_matrix2)
2035
2069
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
@@ -2067,11 +2101,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
2067
2101
2068
2102
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
2069
2103
2070
- if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2. properties . vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2071
- // Intel drivers don't support coopmat properly yet
2072
- // Only RADV supports coopmat properly on AMD
2073
- device->coopmat_support = false ;
2074
- }
2104
+ // if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2105
+ // // Intel drivers don't support coopmat properly yet
2106
+ // // Only RADV supports coopmat properly on AMD
2107
+ // device->coopmat_support = false;
2108
+ // }
2075
2109
2076
2110
std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device .getQueueFamilyProperties ();
2077
2111
@@ -2123,6 +2157,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
2123
2157
device_extensions.push_back (" VK_EXT_pipeline_robustness" );
2124
2158
}
2125
2159
2160
+ VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
2161
+ subgroup_size_control_features.pNext = nullptr ;
2162
+ subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
2163
+ subgroup_size_control_features.computeFullSubgroups = false ;
2164
+ subgroup_size_control_features.subgroupSizeControl = false ;
2165
+
2166
+ if (device->subgroup_size_control ) {
2167
+ last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
2168
+ last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
2169
+ }
2170
+
2126
2171
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
2127
2172
coopmat_features.pNext = nullptr ;
2128
2173
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
@@ -2150,6 +2195,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
2150
2195
2151
2196
device->pipeline_robustness = pl_robustness_features.pipelineRobustness ;
2152
2197
2198
+ device->subgroup_size_control = device->subgroup_size_control && // extension was advertised by the device
2199
+ (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) && // compute stage must accept a required subgroup size
2200
+ subgroup_size_control_features.subgroupSizeControl; // feature bit must be set; previously this test was inverted and enabled the path only on unsupported devices
2201
+
2202
+ if (device->subgroup_size_control ) {
2203
+ device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize ;
2204
+ device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize ;
2205
+ device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups ;
2206
+ device_extensions.push_back (" VK_EXT_subgroup_size_control" );
2207
+ }
2208
+
2153
2209
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix ;
2154
2210
2155
2211
if (coopmat2_support) {
@@ -2430,11 +2486,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
2430
2486
}
2431
2487
}
2432
2488
2433
- if (props2.properties .vendorID == VK_VENDOR_ID_INTEL || (props2.properties .vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2434
- // Intel drivers don't support coopmat properly yet
2435
- // Only RADV supports coopmat properly on AMD
2436
- coopmat_support = false ;
2437
- }
2489
+ // if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2490
+ // // Intel drivers don't support coopmat properly yet
2491
+ // // Only RADV supports coopmat properly on AMD
2492
+ // coopmat_support = false;
2493
+ // }
2438
2494
2439
2495
const char * GGML_VK_DISABLE_F16 = getenv (" GGML_VK_DISABLE_F16" );
2440
2496
bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr ;
0 commit comments