@@ -1725,7 +1725,7 @@ uint __builtin_spirv_OpGroupNonUniformBallotFindMSB_i32_v4i32(uint Execution, ui
1725
1725
{
1726
1726
if (Execution == Subgroup )
1727
1727
{
1728
- return __builtin_spirv_OpenCL_clz_i32 (Value .x );
1728
+ return ( sizeof ( uint ) * 8 ) - __builtin_spirv_OpenCL_clz_i32 (Value .x );
1729
1729
}
1730
1730
return 0 ;
1731
1731
}
@@ -2083,6 +2083,117 @@ DEFN_UNIFORM_GROUP_FUNC(SMax, int, i32, __builtin_spirv_OpenCL_s_max_i32_i32,
2083
2083
DEFN_UNIFORM_GROUP_FUNC (SMax , long , i64 , __builtin_spirv_OpenCL_s_max_i64_i64 , LONG_MIN )
2084
2084
2085
2085
#if defined(cl_khr_subgroup_non_uniform_arithmetic ) || defined(cl_khr_subgroup_clustered_reduce )
2086
// Reduce X with `op` over every currently active subgroup channel, then
// broadcast the result from the first active lane so all lanes' X agree.
// `identity` must satisfy op(identity, v) == v for the given `op`.
#define DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X)               \
{                                                                                         \
    uint activeChannels = __builtin_IB_WaveBallot(true); /* mask of live lanes */         \
    uint firstActive = __builtin_spirv_OpenCL_ctz_i32(activeChannels);                    \
                                                                                          \
    type result = identity;                                                               \
    while (activeChannels)                                                                \
    {                                                                                     \
        /* lowest still-unprocessed active lane */                                        \
        uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels);                   \
                                                                                          \
        type value = intel_sub_group_shuffle(X, activeId);                                \
        result = op(value, result);                                                       \
                                                                                          \
        uint disable = 1 << activeId; /* mark that lane as consumed */                    \
        activeChannels ^= disable;                                                        \
    }                                                                                     \
                                                                                          \
    /* NOTE(review): only .s0 is assigned; presumably a subgroup broadcast */             \
    /* reads just the first local-id component -- confirm.                 */             \
    uint3 vec3;                                                                           \
    vec3.s0 = firstActive;                                                                \
    X = __builtin_spirv_OpGroupBroadcast_i32_##type_abbr##_v3i32(Subgroup, result, vec3); \
}
2107
+
2108
// Inclusive scan: after expansion each active lane's X holds `op` folded over
// the X values of every active lane with id <= its own.
#define DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM(type, type_abbr, op, identity, X)  \
{                                                                               \
    uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId();            \
    uint activeChannels = __builtin_IB_WaveBallot(true);                        \
                                                                                \
    while (activeChannels)                                                      \
    {                                                                           \
        /* lowest still-unprocessed active lane */                              \
        uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels);         \
                                                                                \
        type value = intel_sub_group_shuffle(X, activeId);                      \
        /* fold in contributions only from strictly lower lanes */              \
        if (sglid > activeId)                                                   \
            X = op(value, X);                                                   \
                                                                                \
        uint disable = 1 << activeId;                                           \
        activeChannels ^= disable;                                              \
    }                                                                           \
}
2125
+
2126
// Exclusive scan: each active lane's X becomes `op` folded over the X values
// of active lanes with id strictly below its own (identity for the first
// active lane). Implemented by first shifting every lane's value up to the
// next active lane, then running the same loop as the inclusive scan.
#define DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM(type, type_abbr, op, identity, X)                       \
{                                                                                                    \
    uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId();                                 \
    uint activeChannels = __builtin_IB_WaveBallot(true);                                             \
                                                                                                     \
    uint mask = (1 << sglid) - 1; /* lanes strictly below this one */                                \
    /* index of the nearest lower active lane */                                                     \
    uint sglidPrev = (sizeof(uint) * 8 - __builtin_spirv_OpenCL_clz_i32(activeChannels & mask)) - 1; \
    /* NOTE(review): with no lower active lane, clz(0) == 32 wraps sglidPrev   */                    \
    /* to 0xFFFFFFFF so offsetToPrevActive > sglid and the shuffle_up below    */                    \
    /* pulls `identity` instead -- presumably intentional; confirm.            */                    \
    uint offsetToPrevActive = sglid - sglidPrev;                                                     \
    X = intel_sub_group_shuffle_up((type)identity, X, offsetToPrevActive);                           \
                                                                                                     \
    while (activeChannels)                                                                           \
    {                                                                                                \
        uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels);                              \
                                                                                                     \
        type value = intel_sub_group_shuffle(X, activeId);                                           \
        /* fold in contributions only from strictly lower lanes */                                   \
        if (sglid > activeId)                                                                        \
            X = op(value, X);                                                                        \
                                                                                                     \
        uint disable = 1 << activeId;                                                                \
        activeChannels ^= disable;                                                                   \
    }                                                                                                \
}
2148
+
2149
// Clustered reduce: partitions the active channels into numClusters groups of
// ClusterSize consecutive active lanes; each cluster is reduced independently
// and every lane in a cluster receives its cluster's result in X.
// Fix: removed the redundant outer `uint clusterIndex = 0;` that was
// immediately shadowed by the loop-scoped declaration (behavior unchanged).
#define DEFN_SUB_GROUP_CLUSTERED_REDUCE(type, type_abbr, op, identity, X, ClusterSize)                   \
{                                                                                                        \
    uint activeChannels = __builtin_IB_WaveBallot(true);                                                 \
    uint numActive = __builtin_spirv_OpenCL_popcount_i32(activeChannels);                                \
    uint numClusters = numActive / ClusterSize;                                                          \
                                                                                                         \
    for (uint clusterIndex = 0; clusterIndex < numClusters; clusterIndex++)                              \
    {                                                                                                    \
        uint Counter = ClusterSize;                                                                      \
        uint Ballot = activeChannels;                                                                    \
        uint clusterBallot = 0;                                                                          \
        /* collect the ClusterSize lowest remaining active lanes */                                      \
        while (Counter--)                                                                                \
        {                                                                                                \
            uint trailingOne = 1 << __builtin_spirv_OpenCL_ctz_i32(Ballot);                              \
            clusterBallot |= trailingOne;                                                                \
            Ballot ^= trailingOne;                                                                       \
        }                                                                                                \
        uint active = __builtin_spirv_OpGroupNonUniformInverseBallot_i32_v4i32(Subgroup, clusterBallot); \
        if (active) /* only lanes belonging to this cluster participate */                               \
        {                                                                                                \
            DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X)                          \
        }                                                                                                \
        activeChannels ^= clusterBallot; /* retire this cluster's lanes */                               \
    }                                                                                                    \
}
2175
+
2176
// Dispatch a non-uniform subgroup operation to the matching implementation.
// Expands inside a function returning `type`; an unknown Operation returns 0
// from the enclosing function.
// Fix: removed the unreachable `break;` that followed `return 0;`.
#define SUB_GROUP_SWITCH_NON_UNIFORM(type, type_abbr, op, identity, X, Operation, ClusterSize)  \
{                                                                                               \
    switch (Operation) {                                                                        \
    case GroupOperationReduce:                                                                  \
        DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X)                     \
        break;                                                                                  \
    case GroupOperationInclusiveScan:                                                           \
        DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM(type, type_abbr, op, identity, X)                  \
        break;                                                                                  \
    case GroupOperationExclusiveScan:                                                           \
        DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM(type, type_abbr, op, identity, X)                  \
        break;                                                                                  \
    case GroupOperationClusteredReduce:                                                         \
        DEFN_SUB_GROUP_CLUSTERED_REDUCE(type, type_abbr, op, identity, X, ClusterSize)          \
        break;                                                                                  \
    default:                                                                                    \
        return 0;                                                                               \
    }                                                                                           \
}
2196
+
2086
2197
// ClusterSize is an optional parameter
2087
2198
#define DEFN_NON_UNIFORM_GROUP_FUNC (func , type , type_abbr , op , identity ) \
2088
2199
type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint Execution, uint Operation, type X, uint ClusterSize) \
@@ -2109,7 +2220,8 @@ type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint E
2109
2220
} \
2110
2221
} \
2111
2222
else { \
2112
- SUB_GROUP_SWITCH(type, type_abbr, op, identity, X, Operation) \
2223
+ SUB_GROUP_SWITCH_NON_UNIFORM(type, type_abbr, op, identity, X, Operation, ClusterSize) \
2224
+ return X; \
2113
2225
} \
2114
2226
return 0; \
2115
2227
} \
@@ -2171,31 +2283,36 @@ DEFN_NON_UNIFORM_GROUP_FUNC(FMax, half, f16, __builtin_spirv_OpenCL_fmax_f16_f
2171
2283
#endif // defined(cl_khr_fp16)
2172
2284
2173
2285
// OpGroupNonUniformIMul, OpGroupNonUniformFMul
// The identity for multiplication is 1 (0 would zero every reduction).
DEFN_NON_UNIFORM_GROUP_FUNC(IMul, uchar,  i8,  __intel_mul, 1)
DEFN_NON_UNIFORM_GROUP_FUNC(IMul, ushort, i16, __intel_mul, 1)
DEFN_NON_UNIFORM_GROUP_FUNC(IMul, uint,   i32, __intel_mul, 1)
DEFN_NON_UNIFORM_GROUP_FUNC(IMul, ulong,  i64, __intel_mul, 1)
DEFN_NON_UNIFORM_GROUP_FUNC(FMul, float,  f32, __intel_mul, 1)
#if defined(cl_khr_fp64)
DEFN_NON_UNIFORM_GROUP_FUNC(FMul, double, f64, __intel_mul, 1)
#endif // defined(cl_khr_fp64)
#if defined(cl_khr_fp16)
DEFN_NON_UNIFORM_GROUP_FUNC(FMul, half,   f16, __intel_mul, 1)
#endif // defined(cl_khr_fp16)
2185
2297
2186
2298
// OpGroupNonUniformBitwiseAnd, OpGroupNonUniformBitwiseOr, OpGroupNonUniformBitwiseXor
// AND needs a per-width all-ones identity (x & ~0 == x); OR and XOR use 0.
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseAnd, uchar,  i8,  __intel_and, 0xFF)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseAnd, ushort, i16, __intel_and, 0xFFFF)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseAnd, uint,   i32, __intel_and, 0xFFFFFFFF)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseAnd, ulong,  i64, __intel_and, 0xFFFFFFFFFFFFFFFF)

DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseOr,  uchar,  i8,  __intel_or,  0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseOr,  ushort, i16, __intel_or,  0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseOr,  uint,   i32, __intel_or,  0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseOr,  ulong,  i64, __intel_or,  0)

DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseXor, uchar,  i8,  __intel_xor, 0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseXor, ushort, i16, __intel_xor, 0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseXor, uint,   i32, __intel_xor, 0)
DEFN_NON_UNIFORM_GROUP_FUNC(BitwiseXor, ulong,  i64, __intel_xor, 0)
2196
2313
2197
2314
// OpGroupNonUniformLogicalAnd, OpGroupNonUniformLogicalOr, OpGroupNonUniformLogicalXor
// Logical AND's identity is true (1); OR and XOR use false (0).
DEFN_NON_UNIFORM_GROUP_FUNC(LogicalAnd, bool, i1, __intel_and, 1)
DEFN_NON_UNIFORM_GROUP_FUNC(LogicalOr,  bool, i1, __intel_or,  0)
DEFN_NON_UNIFORM_GROUP_FUNC(LogicalXor, bool, i1, __intel_xor, 0)
#endif // defined(cl_khr_subgroup_non_uniform_arithmetic) || defined(cl_khr_subgroup_clustered_reduce)
0 commit comments