@@ -186,8 +186,160 @@ inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
186
186
}
187
187
#endif
188
188
189
// FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
// improved & semantics understood
/// Bitfield-extract (unchecked).
///
/// Returns the \p num_bits wide field of \p source that starts at bit
/// \p bit_start, right-aligned and zero-extended.
///
/// No bounds checking is done on \p bit_start: the caller must keep it below
/// the bit width of \p T, otherwise the shift below is undefined.
///
/// \tparam T The type of \p source, must be an unsigned integer.
/// \param source The value to extract from.
/// \param bit_start The position of the lowest bit to extract.
/// \param num_bits The number of bits to extract; values greater than or
///        equal to the bit width of \p T select every bit from \p bit_start
///        upwards.
/// \return The extracted field in the low bits of the result.
template <typename T>
inline T bfe(const T source, const uint32_t bit_start,
             const uint32_t num_bits) {
  static_assert(std::is_unsigned_v<T>);
  // FIXME(syclcompat-lib-reviewers): This ternary was added to catch a case
  // which may be undefined anyway. Consider that we are losing perf here.
  //
  // The all-ones mask is spelled ~T{0}: T{-1} is a narrowing
  // list-initialization and therefore ill-formed for an unsigned T.
  const T mask = num_bits >= CHAR_BIT * sizeof(T)
                     ? static_cast<T>(~T{0})
                     : static_cast<T>((T{1} << num_bits) - 1);
  return static_cast<T>((source >> bit_start) & mask);
}
207
+
189
208
} // namespace detail
190
209
210
/// Bitfield-extract with boundary checking.
///
/// Extracts the bit field of \p source that starts at bit \p bit_start and is
/// \p num_bits wide, and returns it right-aligned. For an unsigned \p T the
/// result is zero-extended; for a signed \p T it is padded with the sign bit
/// of the extracted field, i.e. the field's most significant bit.
///
/// A field that runs past the most significant bit of \p source is cut off
/// there. If \p num_bits is zero, the result is zero. If \p bit_start lies
/// beyond the most significant bit, the result is zero for an unsigned \p T
/// and filled with the sign bit of \p source for a signed \p T.
///
/// On NVPTX device builds the fixed-width 8/16/32/64-bit integer types are
/// handled by the PTX `bfe` instruction; every other case uses the portable
/// fallback below.
///
/// \tparam T The type of \p source, must be an integer.
/// \param source The value to extract from.
/// \param bit_start The position of the lowest bit to extract.
/// \param num_bits The number of bits to extract.
/// \return The extracted field, zero- or sign-extended to \p T.
template <typename T>
inline T bfe_safe(const T source, const uint32_t bit_start,
                  const uint32_t num_bits) {
  static_assert(std::is_integral_v<T>);
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
                std::is_same_v<T, int32_t>) {
    int32_t res{};
    asm volatile("bfe.s32 %0, %1, %2, %3;"
                 : "=r"(res)
                 : "r"(static_cast<int32_t>(source)), "r"(bit_start),
                   "r"(num_bits));
    return res;
  } else if constexpr (std::is_same_v<T, uint8_t> ||
                       std::is_same_v<T, uint16_t> ||
                       std::is_same_v<T, uint32_t>) {
    uint32_t res{};
    asm volatile("bfe.u32 %0, %1, %2, %3;"
                 : "=r"(res)
                 : "r"(static_cast<uint32_t>(source)), "r"(bit_start),
                   "r"(num_bits));
    return res;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    T res{};
    asm volatile("bfe.s64 %0, %1, %2, %3;"
                 : "=l"(res)
                 : "l"(source), "r"(bit_start), "r"(num_bits));
    return res;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    T res{};
    asm volatile("bfe.u64 %0, %1, %2, %3;"
                 : "=l"(res)
                 : "l"(source), "r"(bit_start), "r"(num_bits));
    return res;
  }
#endif
  constexpr uint32_t bit_width = CHAR_BIT * sizeof(T);
  // Clamp the field to the bits that actually exist. The length is derived
  // from the remaining room rather than from `pos + num_bits`, which would
  // wrap around for very large `num_bits`.
  const uint32_t pos = std::min(bit_start, bit_width);
  const uint32_t room = bit_width - pos;
  const uint32_t len = std::min(num_bits, room);

  if constexpr (std::is_signed_v<T>) {
    // All mask arithmetic happens in the unsigned counterpart so that
    // `(1 << len) - 1` cannot overflow a signed type.
    using U = std::make_unsigned_t<T>;
    const U bits = static_cast<U>(source);
    const U mask = len >= bit_width ? static_cast<U>(~U{0})
                                    : static_cast<U>((U{1} << len) - 1);
    // len == 0 covers pos == bit_width; skipping the shift there avoids a
    // shift by the full width of the type.
    const U field = len == 0 ? U{0} : static_cast<U>((bits >> pos) & mask);

    // The result is padded with the top bit of the extracted field. When the
    // field is empty because bit_start is out of range, that is the sign bit
    // of the source. `pos + len - 1` only wraps for pos == len == 0, in which
    // case std::min still selects the top bit.
    // Note if requested num_bits==0, we return zero via negative=false.
    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
    const bool negative =
        num_bits != 0 && ((bits >> sign_bit_pos) & 1) != 0;
    const U padding = negative ? static_cast<U>(~mask) : U{0};
    return static_cast<T>(field | padding);
  } else {
    if (len == 0)
      return T{0};
    const T mask = len >= bit_width ? static_cast<T>(~T{0})
                                    : static_cast<T>((T{1} << len) - 1);
    return static_cast<T>((source >> pos) & mask);
  }
}
278
+
279
namespace detail {
// FIXME(syclcompat-lib-reviewers): move bfi outside detail once perf is
// improved & semantics understood
/// Bitfield-insert.
///
/// Returns a copy of \p y in which the \p num_bits wide field starting at bit
/// \p bit_start has been replaced by the low \p num_bits bits of \p x. Bits of
/// the field that would land above the most significant bit of \p T are
/// dropped. If \p num_bits is zero, or \p bit_start is not below the bit width
/// of \p T, \p y is returned unchanged.
///
/// \tparam T The type of \p x and \p y, must be an unsigned integer.
/// \param x The source of the bitfield.
/// \param y The value the bitfield is inserted into.
/// \param bit_start The position at which insertion starts.
/// \param num_bits The number of bits to insert.
/// \return \p y with the selected field overwritten from \p x.
template <typename T>
inline T bfi(const T x, const T y, const uint32_t bit_start,
             const uint32_t num_bits) {
  static_assert(std::is_unsigned_v<T>);
  constexpr uint32_t bit_width = CHAR_BIT * sizeof(T);

  // Nothing to insert. Returning early (rather than masking the result away
  // afterwards) keeps every shift below strictly smaller than the width of T.
  if (num_bits == 0 || bit_start >= bit_width)
    return y;

  // A field wider than T is equivalent to one that is exactly as wide as T.
  const uint32_t len = num_bits < bit_width ? num_bits : bit_width;

  // `len` low one-bits, moved to the insertion position. The casts truncate
  // the promoted intermediate back to T for types narrower than int.
  const T low_ones =
      static_cast<T>(static_cast<T>(~T{0}) >> (bit_width - len));
  const T field_mask = static_cast<T>(low_ones << bit_start);
  const T kept = static_cast<T>(y & static_cast<T>(~field_mask));
  const T inserted = static_cast<T>(static_cast<T>(x << bit_start) & field_mask);
  return static_cast<T>(kept | inserted);
}
} // namespace detail
304
+
305
+ // / Bitfield-insert with boundary checking.
306
+ // /
307
+ // / Align and insert a bit field from \param x into \param y . Source \param
308
+ // / bit_start gives the starting bit position for the insertion, and source
309
+ // / \param num_bits gives the bit field length in bits.
310
+ // /
311
+ // / \tparam T The type of \param x and \param y , must be an unsigned integer.
312
+ // / \param x The source of the bitfield.
313
+ // / \param y The source where bitfield is inserted.
314
+ // / \param bit_start The position to start insertion.
315
+ // / \param num_bits The number of bits to insertion.
316
+ template <typename T>
317
+ inline T bfi_safe (const T x, const T y, const uint32_t bit_start,
318
+ const uint32_t num_bits) {
319
+ static_assert (std::is_unsigned_v<T>);
320
+ #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
321
+ if constexpr (std::is_same_v<T, uint8_t > || std::is_same_v<T, uint16_t > ||
322
+ std::is_same_v<T, uint32_t >) {
323
+ uint32_t res{};
324
+ asm volatile (" bfi.b32 %0, %1, %2, %3, %4;"
325
+ : " =r" (res)
326
+ : " r" ((uint32_t )x), " r" ((uint32_t )y), " r" (bit_start),
327
+ " r" (num_bits));
328
+ return res;
329
+ } else if constexpr (std::is_same_v<T, uint64_t >) {
330
+ uint64_t res{};
331
+ asm volatile (" bfi.b64 %0, %1, %2, %3, %4;"
332
+ : " =l" (res)
333
+ : " l" (x), " l" (y), " r" (bit_start), " r" (num_bits));
334
+ return res;
335
+ }
336
+ #endif
337
+ constexpr unsigned bit_width = CHAR_BIT * sizeof (T);
338
+ const uint32_t pos = std::min (bit_start, bit_width);
339
+ const uint32_t len = std::min (pos + num_bits, bit_width) - pos;
340
+ return syclcompat::detail::bfi (x, y, pos, len);
341
+ }
342
+
191
343
// / Emulated function for __funnelshift_l
192
344
inline unsigned int funnelshift_l (unsigned int low, unsigned int high,
193
345
unsigned int shift) {
0 commit comments