@@ -162,14 +162,19 @@ struct vk_device_struct {
162
162
uint32_t subgroup_size;
163
163
uint32_t shader_core_count;
164
164
bool uma;
165
- bool coopmat2;
165
+
166
+ bool subgroup_size_control;
167
+ uint32_t subgroup_min_size;
168
+ uint32_t subgroup_max_size;
169
+ bool subgroup_require_full_support;
166
170
167
171
bool coopmat_support;
168
172
bool coopmat_acc_f32_support;
169
173
bool coopmat_acc_f16_support;
170
174
uint32_t coopmat_m;
171
175
uint32_t coopmat_n;
172
176
uint32_t coopmat_k;
177
+ bool coopmat2;
173
178
174
179
size_t idx;
175
180
@@ -748,8 +753,12 @@ static uint32_t compile_count = 0;
748
753
static std::mutex compile_count_mutex;
749
754
static std::condition_variable compile_count_cond;
750
755
751
- static void ggml_vk_create_pipeline_func (vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void * spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, std::vector<uint32_t > specialization_constants, uint32_t align, bool disable_robustness) {
752
- VK_LOG_DEBUG (" ggml_vk_create_pipeline(" << device->name << " , " << name << " , " << entrypoint << " , " << parameter_count << " , " << push_constant_size << " , (" << wg_denoms[0 ] << " ," << wg_denoms[1 ] << " ," << wg_denoms[2 ] << " ), specialization_constants, " << align << " )" );
756
+ static void ggml_vk_create_pipeline_func (vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void * spv_data, const std::string entrypoint,
757
+ uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, std::vector<uint32_t > specialization_constants,
758
+ uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
759
+ VK_LOG_DEBUG (" ggml_vk_create_pipeline(" << device->name << " , " << name << " , " << entrypoint << " , " << parameter_count << " , " << push_constant_size <<
760
+ " , (" << wg_denoms[0 ] << " ," << wg_denoms[1 ] << " ," << wg_denoms[2 ] << " ), specialization_constants, " << align <<
761
+ " , " << disable_robustness << " , " << require_full_subgroups << " , " << required_subgroup_size << " )" );
753
762
GGML_ASSERT (parameter_count > 0 );
754
763
GGML_ASSERT (wg_denoms[0 ] > 0 && wg_denoms[1 ] > 0 && wg_denoms[2 ] > 0 ); // NOLINT
755
764
@@ -808,14 +817,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
808
817
specialization_constants.data ()
809
818
);
810
819
820
+ vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
821
+
822
+ if (device->subgroup_require_full_support && require_full_subgroups) {
823
+ pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
824
+ }
825
+
811
826
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info (
812
- vk::PipelineShaderStageCreateFlags () ,
827
+ pipeline_shader_stage_create_flags ,
813
828
vk::ShaderStageFlagBits::eCompute,
814
829
pipeline->shader_module ,
815
830
entrypoint.c_str (),
816
831
&specialization_info);
832
+
833
+ vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
834
+ pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
835
+ if (device->subgroup_size_control && required_subgroup_size > 0 ) {
836
+ GGML_ASSERT (device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size );
837
+ pipeline_shader_create_info.setPNext (&pipeline_shader_stage_required_subgroup_size_create_info);
838
+ }
839
+
817
840
vk::ComputePipelineCreateInfo compute_pipeline_create_info (
818
- vk::PipelineCreateFlags () ,
841
+ vk::PipelineCreateFlags{} ,
819
842
pipeline_shader_create_info,
820
843
pipeline->layout );
821
844
@@ -1495,7 +1518,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
1495
1518
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1496
1519
1497
1520
std::vector<std::future<void >> compiles;
1498
- auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void * spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, const std::vector<uint32_t >& specialization_constants, uint32_t align, bool disable_robustness = false ) {
1521
+ auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void * spv_data, const std::string &entrypoint,
1522
+ uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t , 3 > wg_denoms, const std::vector<uint32_t >& specialization_constants,
1523
+ uint32_t align, bool disable_robustness = false , bool require_full_subgroups = false , uint32_t required_subgroup_size = 0 ) {
1499
1524
{
1500
1525
// wait until fewer than N compiles are in progress
1501
1526
uint32_t N = std::max (1u , std::thread::hardware_concurrency ());
@@ -1505,7 +1530,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
1505
1530
}
1506
1531
compile_count++;
1507
1532
}
1508
- compiles.push_back (std::async (ggml_vk_create_pipeline_func, std::ref (device), std::ref (pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
1533
+ compiles.push_back (std::async (ggml_vk_create_pipeline_func, std::ref (device), std::ref (pipeline), name, spv_size, spv_data, entrypoint,
1534
+ parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
1509
1535
};
1510
1536
1511
1537
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@@ -1611,17 +1637,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
1611
1637
// Create 6 variants, {s,m,l}x{unaligned,aligned}
1612
1638
#define CREATE_MM (PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID ) \
1613
1639
if (device->mul_mat ## ID ## _l) \
1614
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->l , #NAMELC #F16ACC " _l" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1 ); \
1640
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->l , #NAMELC #F16ACC " _l" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1 , false , true ); \
1615
1641
if (device->mul_mat ## ID ## _m) \
1616
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->m , #NAMELC #F16ACC " _m" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1 ); \
1642
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->m , #NAMELC #F16ACC " _m" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1 , false , true ); \
1617
1643
if (device->mul_mat ## ID ## _s) \
1618
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->s , #NAMELC #F16ACC " _s" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1 ); \
1644
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->s , #NAMELC #F16ACC " _s" , NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1 , false , true ); \
1619
1645
if (device->mul_mat ## ID ## _l) \
1620
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_l , #NAMELC #F16ACC " _aligned_l" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
1646
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_l , #NAMELC #F16ACC " _aligned_l" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false , true ); \
1621
1647
if (device->mul_mat ## ID ## _m) \
1622
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_m , #NAMELC #F16ACC " _aligned_m" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
1648
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_m , #NAMELC #F16ACC " _aligned_m" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false , true ); \
1623
1649
if (device->mul_mat ## ID ## _s) \
1624
- ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_s , #NAMELC #F16ACC " _aligned_s" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
1650
+ ggml_vk_create_pipeline (device, device-> PIPELINE_NAME ->a_s , #NAMELC #F16ACC " _aligned_s" , NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, " main" , PARAMCOUNT, sizeof (PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false , true ); \
1625
1651
1626
1652
// Create 2 variants, {f16,f32} accumulator
1627
1653
#define CREATE_MM2 (PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID ) \
@@ -1988,6 +2014,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
1988
2014
amd_shader_core_properties2 = true ;
1989
2015
} else if (strcmp (" VK_EXT_pipeline_robustness" , properties.extensionName ) == 0 ) {
1990
2016
pipeline_robustness = true ;
2017
+ } else if (strcmp (" VK_EXT_subgroup_size_control" , properties.extensionName ) == 0 ) {
2018
+ device->subgroup_size_control = true ;
1991
2019
} else if (strcmp (" VK_KHR_cooperative_matrix" , properties.extensionName ) == 0 &&
1992
2020
!getenv (" GGML_VK_DISABLE_COOPMAT" )) {
1993
2021
device->coopmat_support = true ;
@@ -2007,6 +2035,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
2007
2035
vk::PhysicalDeviceDriverProperties driver_props;
2008
2036
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
2009
2037
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
2038
+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
2039
+
2010
2040
props2.pNext = &props3;
2011
2041
props3.pNext = &subgroup_props;
2012
2042
subgroup_props.pNext = &driver_props;
@@ -2025,6 +2055,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
2025
2055
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
2026
2056
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
2027
2057
}
2058
+ if (device->subgroup_size_control ) {
2059
+ last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
2060
+ last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
2061
+ }
2028
2062
2029
2063
#if defined(VK_NV_cooperative_matrix2)
2030
2064
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
@@ -2062,11 +2096,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
2062
2096
2063
2097
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
2064
2098
2065
- if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2. properties . vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2066
- // Intel drivers don't support coopmat properly yet
2067
- // Only RADV supports coopmat properly on AMD
2068
- device->coopmat_support = false ;
2069
- }
2099
+ // if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2100
+ // // Intel drivers don't support coopmat properly yet
2101
+ // // Only RADV supports coopmat properly on AMD
2102
+ // device->coopmat_support = false;
2103
+ // }
2070
2104
2071
2105
std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device .getQueueFamilyProperties ();
2072
2106
@@ -2118,6 +2152,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
2118
2152
device_extensions.push_back (" VK_EXT_pipeline_robustness" );
2119
2153
}
2120
2154
2155
+ VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
2156
+ subgroup_size_control_features.pNext = nullptr ;
2157
+ subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
2158
+ subgroup_size_control_features.computeFullSubgroups = false ;
2159
+ subgroup_size_control_features.subgroupSizeControl = false ;
2160
+
2161
+ if (device->subgroup_size_control ) {
2162
+ last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
2163
+ last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
2164
+ }
2165
+
2121
2166
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
2122
2167
coopmat_features.pNext = nullptr ;
2123
2168
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
@@ -2145,6 +2190,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
2145
2190
2146
2191
device->pipeline_robustness = pl_robustness_features.pipelineRobustness ;
2147
2192
2193
+ // Keep subgroup size control enabled only when all three hold: the extension was found,
+ // the compute stage is listed in requiredSubgroupSizeStages, and the subgroupSizeControl
+ // feature is reported. The previous condition was inverted: it enabled the path exactly
+ // when the compute stage or the feature was missing.
+ device->subgroup_size_control = device->subgroup_size_control &&
2194
+ (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
2195
+ subgroup_size_control_features.subgroupSizeControl;
2196
+
2197
+ if (device->subgroup_size_control ) {
2198
+ device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize ;
2199
+ device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize ;
2200
+ device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups ;
2201
+ device_extensions.push_back (" VK_EXT_subgroup_size_control" );
2202
+ }
2203
+
2148
2204
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix ;
2149
2205
2150
2206
if (coopmat2_support) {
@@ -2427,11 +2483,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
2427
2483
}
2428
2484
}
2429
2485
2430
- if (props2.properties .vendorID == VK_VENDOR_ID_INTEL || (props2.properties .vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2431
- // Intel drivers don't support coopmat properly yet
2432
- // Only RADV supports coopmat properly on AMD
2433
- coopmat_support = false ;
2434
- }
2486
+ // if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
2487
+ // // Intel drivers don't support coopmat properly yet
2488
+ // // Only RADV supports coopmat properly on AMD
2489
+ // coopmat_support = false;
2490
+ // }
2435
2491
2436
2492
const char * GGML_VK_DISABLE_F16 = getenv (" GGML_VK_DISABLE_F16" );
2437
2493
bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr ;
0 commit comments