@@ -1725,7 +1725,7 @@ uint __builtin_spirv_OpGroupNonUniformBallotFindMSB_i32_v4i32(uint Execution, ui
1725
1725
{
1726
1726
if (Execution == Subgroup )
1727
1727
{
1728
- return ( sizeof ( uint ) * 8 ) - __builtin_spirv_OpenCL_clz_i32 (Value .x );
1728
+ return __builtin_spirv_OpenCL_clz_i32 (Value .x );
1729
1729
}
1730
1730
return 0 ;
1731
1731
}
@@ -2083,117 +2083,6 @@ DEFN_UNIFORM_GROUP_FUNC(SMax, int, i32, __builtin_spirv_OpenCL_s_max_i32_i32,
2083
2083
DEFN_UNIFORM_GROUP_FUNC (SMax , long , i64 , __builtin_spirv_OpenCL_s_max_i64_i64 , LONG_MIN )
2084
2084
2085
2085
#if defined(cl_khr_subgroup_non_uniform_arithmetic ) || defined(cl_khr_subgroup_clustered_reduce )
2086
- #define DEFN_SUB_GROUP_REDUCE_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2087
- { \
2088
- uint activeChannels = __builtin_IB_WaveBallot(true); \
2089
- uint firstActive = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2090
- \
2091
- type result = identity; \
2092
- while (activeChannels) \
2093
- { \
2094
- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2095
- \
2096
- type value = intel_sub_group_shuffle(X, activeId); \
2097
- result = op(value, result); \
2098
- \
2099
- uint disable = 1 << activeId; \
2100
- activeChannels ^= disable; \
2101
- } \
2102
- \
2103
- uint3 vec3; \
2104
- vec3.s0 = firstActive; \
2105
- X = __builtin_spirv_OpGroupBroadcast_i32_##type_abbr##_v3i32(Subgroup, result, vec3); \
2106
- }
2107
-
2108
- #define DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2109
- { \
2110
- uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId(); \
2111
- uint activeChannels = __builtin_IB_WaveBallot(true); \
2112
- \
2113
- while (activeChannels) \
2114
- { \
2115
- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2116
- \
2117
- type value = intel_sub_group_shuffle(X, activeId); \
2118
- if (sglid > activeId) \
2119
- X = op(value, X); \
2120
- \
2121
- uint disable = 1 << activeId; \
2122
- activeChannels ^= disable; \
2123
- } \
2124
- }
2125
-
2126
- #define DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2127
- { \
2128
- uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId(); \
2129
- uint activeChannels = __builtin_IB_WaveBallot(true); \
2130
- \
2131
- uint mask = (1 << sglid) - 1; \
2132
- uint sglidPrev = (sizeof(uint) * 8 - __builtin_spirv_OpenCL_clz_i32(activeChannels & mask)) - 1; \
2133
- uint offsetToPrevActive = sglid - sglidPrev; \
2134
- X = intel_sub_group_shuffle_up((type)identity, X, offsetToPrevActive); \
2135
- \
2136
- while (activeChannels) \
2137
- { \
2138
- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2139
- \
2140
- type value = intel_sub_group_shuffle(X, activeId); \
2141
- if (sglid > activeId) \
2142
- X = op(value, X); \
2143
- \
2144
- uint disable = 1 << activeId; \
2145
- activeChannels ^= disable; \
2146
- } \
2147
- }
2148
-
2149
- #define DEFN_SUB_GROUP_CLUSTERED_REDUCE (type , type_abbr , op , identity , X , ClusterSize ) \
2150
- { \
2151
- uint clusterIndex = 0; \
2152
- uint activeChannels = __builtin_IB_WaveBallot(true); \
2153
- uint numActive = __builtin_spirv_OpenCL_popcount_i32(activeChannels); \
2154
- uint numClusters = numActive / ClusterSize; \
2155
- \
2156
- for (uint clusterIndex = 0; clusterIndex < numClusters; clusterIndex++) \
2157
- { \
2158
- uint Counter = ClusterSize; \
2159
- uint Ballot = activeChannels; \
2160
- uint clusterBallot = 0; \
2161
- while (Counter--) \
2162
- { \
2163
- uint trailingOne = 1 << __builtin_spirv_OpenCL_ctz_i32(Ballot); \
2164
- clusterBallot |= trailingOne; \
2165
- Ballot ^= trailingOne; \
2166
- } \
2167
- uint active = __builtin_spirv_OpGroupNonUniformInverseBallot_i32_v4i32(Subgroup, clusterBallot); \
2168
- if (active) \
2169
- { \
2170
- DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X) \
2171
- } \
2172
- activeChannels ^= clusterBallot; \
2173
- } \
2174
- }
2175
-
2176
- #define SUB_GROUP_SWITCH_NON_UNIFORM (type , type_abbr , op , identity , X , Operation , ClusterSize ) \
2177
- { \
2178
- switch (Operation){ \
2179
- case GroupOperationReduce: \
2180
- DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X) \
2181
- break; \
2182
- case GroupOperationInclusiveScan: \
2183
- DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM(type, type_abbr, op, identity, X) \
2184
- break; \
2185
- case GroupOperationExclusiveScan: \
2186
- DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM(type, type_abbr, op, identity, X) \
2187
- break; \
2188
- case GroupOperationClusteredReduce: \
2189
- DEFN_SUB_GROUP_CLUSTERED_REDUCE(type, type_abbr, op, identity, X, ClusterSize) \
2190
- break; \
2191
- default: \
2192
- return 0; \
2193
- break; \
2194
- } \
2195
- }
2196
-
2197
2086
// ClusterSize is an optional parameter
2198
2087
#define DEFN_NON_UNIFORM_GROUP_FUNC (func , type , type_abbr , op , identity ) \
2199
2088
type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint Execution, uint Operation, type X, uint ClusterSize) \
@@ -2220,8 +2109,7 @@ type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint E
2220
2109
} \
2221
2110
} \
2222
2111
else { \
2223
- SUB_GROUP_SWITCH_NON_UNIFORM(type, type_abbr, op, identity, X, Operation, ClusterSize) \
2224
- return X; \
2112
+ SUB_GROUP_SWITCH(type, type_abbr, op, identity, X, Operation) \
2225
2113
} \
2226
2114
return 0; \
2227
2115
} \
@@ -2283,36 +2171,31 @@ DEFN_NON_UNIFORM_GROUP_FUNC(FMax, half, f16, __builtin_spirv_OpenCL_fmax_f16_f
2283
2171
#endif // defined(cl_khr_fp16)
2284
2172
2285
2173
// OpGroupNonUniformIMul, OpGroupNonUniformFMul
2286
- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uchar , i8 , __intel_mul , 1 )
2287
- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ushort , i16 , __intel_mul , 1 )
2288
- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uint , i32 , __intel_mul , 1 )
2289
- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ulong , i64 , __intel_mul , 1 )
2290
- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , float , f32 , __intel_mul , 1 )
2174
+ // OpGroupNonUniformIMul / FMul instantiations. The last argument is the
+ // operation's identity (neutral) element. For multiplication that is 1;
+ // 0 is the absorbing element and would zero any product it is combined with.
+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uchar , i8 , __intel_mul , 1 )
2175
+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ushort , i16 , __intel_mul , 1 )
2176
+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uint , i32 , __intel_mul , 1 )
2177
+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ulong , i64 , __intel_mul , 1 )
2178
+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , float , f32 , __intel_mul , 1 )
2291
2179
#if defined(cl_khr_fp64 )
2292
- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , double , f64 , __intel_mul , 1 )
2180
+ // Double-precision FMul: the multiplicative identity is 1 (0 would absorb every product).
+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , double , f64 , __intel_mul , 1 )
2293
2181
#endif // defined(cl_khr_fp64)
2294
2182
#if defined(cl_khr_fp16 )
2295
- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , half , f16 , __intel_mul , 1 )
2183
+ // Half-precision FMul: the multiplicative identity is 1 (0 would absorb every product).
+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , half , f16 , __intel_mul , 1 )
2296
2184
#endif // defined(cl_khr_fp16)
2297
2185
2298
2186
// OpGroupNonUniformBitwiseAnd, OpGroupNonUniformBitwiseOr, OpGroupNonUniformBitwiseXor
2299
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uchar , i8 , __intel_and , 0xFF )
2300
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ushort , i16 , __intel_and , 0xFFFF )
2301
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uint , i32 , __intel_and , 0xFFFFFFFF )
2302
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ulong , i64 , __intel_and , 0xFFFFFFFFFFFFFFFF )
2303
-
2304
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , uchar , i8 , __intel_or , 0 )
2305
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , ushort , i16 , __intel_or , 0 )
2306
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , uint , i32 , __intel_or , 0 )
2307
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , ulong , i64 , __intel_or , 0 )
2187
+ // Instantiates DEFN_NON_UNIFORM_GROUP_FUNC for one bitwise operation over the
+ // four unsigned integer widths (uchar/i8, ushort/i16, uint/i32, ulong/i64).
+ //   func - operation suffix used in the generated function name
+ //   op   - suffix pasted onto __intel_ to select the combining helper
+ // Every instantiation passes 0 as the identity argument.
+ // NOTE(review): 0 is the neutral element of or and xor only; for and the
+ // neutral element is all-ones at the operand width, so this helper is not
+ // suitable for an and operation as written.
+ #define DEFN_NON_UNIFORM_BITWISE_OPERATION (func , op ) \
2188
+ DEFN_NON_UNIFORM_GROUP_FUNC(func, uchar, i8, __intel_##op, 0) \
2189
+ DEFN_NON_UNIFORM_GROUP_FUNC(func, ushort, i16, __intel_##op, 0) \
2190
+ DEFN_NON_UNIFORM_GROUP_FUNC(func, uint, i32, __intel_##op, 0) \
2191
+ DEFN_NON_UNIFORM_GROUP_FUNC(func, ulong, i64, __intel_##op, 0)
2308
2192
2309
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , uchar , i8 , __intel_xor , 0 )
2310
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , ushort , i16 , __intel_xor , 0 )
2311
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , uint , i32 , __intel_xor , 0 )
2312
- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , ulong , i64 , __intel_xor , 0 )
2193
+ // BitwiseAnd is instantiated per type instead of through
+ // DEFN_NON_UNIFORM_BITWISE_OPERATION: that helper passes 0 as the identity,
+ // but the identity of bitwise and is all-ones at the operand's width.
+ DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uchar , i8 , __intel_and , 0xFF )
+ DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ushort , i16 , __intel_and , 0xFFFF )
+ DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uint , i32 , __intel_and , 0xFFFFFFFF )
+ DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ulong , i64 , __intel_and , 0xFFFFFFFFFFFFFFFF )
2194
+ // Or and xor both have 0 as their identity, so the shared helper applies unchanged.
+ DEFN_NON_UNIFORM_BITWISE_OPERATION (BitwiseOr , or )
2195
+ DEFN_NON_UNIFORM_BITWISE_OPERATION (BitwiseXor , xor )
2313
2196
2314
2197
// OpGroupNonUniformLogicalAnd, OpGroupNonUniformLogicalOr, OpGroupNonUniformLogicalXor
2315
- DEFN_NON_UNIFORM_GROUP_FUNC (LogicalAnd , bool , i1 , __intel_and , 1 )
2198
+ // Logical operations on bool. The last argument is the operation's identity:
+ // true (1) for and, false (0) for or and xor. Passing 0 to and would make it
+ // the absorbing element and force a false result wherever it is combined.
+ DEFN_NON_UNIFORM_GROUP_FUNC (LogicalAnd , bool , i1 , __intel_and , 1 )
2316
2199
DEFN_NON_UNIFORM_GROUP_FUNC (LogicalOr , bool , i1 , __intel_or , 0 )
2317
2200
DEFN_NON_UNIFORM_GROUP_FUNC (LogicalXor , bool , i1 , __intel_xor , 0 )
2318
2201
#endif // defined(cl_khr_subgroup_non_uniform_arithmetic) || defined(cl_khr_subgroup_clustered_reduce)
0 commit comments