@@ -14302,6 +14302,241 @@ named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode,
14302
14302
14303
14303
/// @} sycl_esimd_memory_nbarrier
14304
14304
14305
+ /// @defgroup sycl_esimd_mask_compressed Mask compressed APIs.
14306
+ /// @ingroup sycl_esimd_memory
14307
+
14308
+ /// @addtogroup sycl_esimd_mask_compressed
14309
+ /// @{
14310
+
14311
+ /// template <typename T, int N,
14312
+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14313
+ /// simd<T, N>
14314
+ /// mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {});
14315
+ /// Mask expand load from USM memory location.
14316
+ /// The function reads data from a memory location using following algorithm:
14317
+ ///
14318
+ /// \code{.cpp}
14319
+ ///
14320
+ /// int Index = 0;
14321
+ /// for (int i = 0; i < N; ++i) {
14322
+ /// if (Mask[i]) {
14323
+ /// Result[i] = *(p + Index);
14324
+ /// ++Index;
14325
+ /// }
14326
+ /// }
14327
+ /// \endcode
14328
+ ///
14329
+ ///
14330
+ /// @tparam T is the element type.
14331
+ /// @tparam N is the data size.
14332
+ /// @param p is the base address for this operation.
14333
+ /// @param mask is the mask determining which elements will be read.
14334
+ /// @param props The compile-time properties. Only cache hint
14335
+ /// properties are used.
14336
+ ///
14337
+ template <typename T, int N,
14338
+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14339
+ __ESIMD_API std::enable_if_t<
14340
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
14341
+ mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {}) {
14342
+ // offsets::value contains binary masks that for every location at index i it
14343
+ // contains i 1's i.e. 0,1,3,7,...
14344
+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14345
+ // Performing '&' operation with packed mask will leave at index i a bitmask
14346
+ // with number of 1's corresponding to a number of elements to be loaded so
14347
+ // far (number of 1's in the mask preceding the index i). Number of 1's
14348
+ // becomes an index for compressed store/expanded load operation.
14349
+ simd<uint32_t, N> offset =
14350
+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14351
+ return gather(p, offset * sizeof(T), mask, props);
14352
+ }
14353
+
14354
+ /// template <typename T, int N, typename AccessorTy,
14355
+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14356
+ /// simd<T, N>
14357
+ /// mask_expand_load(AccessorTy acc, simd_mask<N> mask, PropertyListT props =
14358
+ /// {});
14359
+ /// Mask expand load from accessor memory (could be local or device
14360
+ /// accessor). The function reads data from a memory location using following
14361
+ /// algorithm:
14362
+ ///
14363
+ /// \code{.cpp}
14364
+ ///
14365
+ /// int Index = 0;
14366
+ /// for (int i = 0; i < N; ++i) {
14367
+ /// if (Mask[i])
14368
+ /// Result[i] = acc[global_offset + Index++];
14369
+ /// }
14370
+ /// \endcode
14371
+ ///
14372
+ ///
14373
+ /// @tparam T is the element type.
14374
+ /// @tparam N is the data size.
14375
+ /// @param acc is the accessor to read from.
14376
+ /// @param global_offset is the global offset in bytes.
14377
+ /// @param mask is the mask determining which elements will be read.
14378
+ /// @param props The compile-time properties. Only cache hint
14379
+ /// properties are used.
14380
+ ///
14381
+ template <typename T, int N, typename AccessorTy,
14382
+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14383
+ __ESIMD_API std::enable_if_t<
14384
+ ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14385
+ detail::is_accessor_with_v<AccessorTy,
14386
+ detail::accessor_mode_cap::can_read>,
14387
+ simd<T, N>>
14388
+ mask_expand_load(AccessorTy acc, uint32_t global_offset, simd_mask<N> mask,
14389
+ PropertyListT props = {}) {
14390
+ // offsets::value contains binary masks that for every location at index i it
14391
+ // contains i 1's i.e. 0,1,3,7,...
14392
+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14393
+ // Performing '&' operation with packed mask will leave at index i a bitmask
14394
+ // with number of 1's corresponding to a number of elements to be loaded so
14395
+ // far (number of 1's in the mask preceding the index i). Number of 1's
14396
+ // becomes an index for compressed store/expanded load operation.
14397
+ simd<uint32_t, N> offset =
14398
+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14399
+ return gather<T>(acc, offset * sizeof(T) + global_offset, mask, props);
14400
+ }
14401
+
14402
+ /// template <typename T, int N,
14403
+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14404
+ /// simd<T, N>
14405
+ /// mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14406
+ /// PropertyListT props = {});
14407
+ /// Mask compressed store to USM memory location.
14408
+ /// The function stores data to a memory location using following algorithm:
14409
+ ///
14410
+ /// \code{.cpp}
14411
+ ///
14412
+ /// int Index = 0;
14413
+ /// for (int i = 0; i < N; ++i) {
14414
+ /// if (Mask[i]) {
14415
+ /// *(p + Index) = val[i];
14416
+ /// ++Index;
14417
+ /// }
14418
+ /// }
14419
+ /// \endcode
14420
+ ///
14421
+ ///
14422
+ /// @tparam T is the element type.
14423
+ /// @tparam N is the data size.
14424
+ /// @param p is the base address for this operation.
14425
+ /// @param vals is the data to store.
14426
+ /// @param mask is the mask determining which elements will be stored.
14427
+ /// @param props The compile-time properties. Only cache hint
14428
+ /// properties are used.
14429
+ ///
14430
+ template <typename T, int N,
14431
+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14432
+ __ESIMD_API std::enable_if_t<
14433
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14434
+ mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14435
+ PropertyListT props = {}) {
14436
+ // offsets::value contains binary masks that for every location at index i it
14437
+ // contains i 1's i.e. 0,1,3,7,...
14438
+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14439
+ // Performing '&' operation with packed mask will leave at index i a bitmask
14440
+ // with number of 1's corresponding to a number of elements to be loaded so
14441
+ // far (number of 1's in the mask preceding the index i). Number of 1's
14442
+ // becomes an index for compressed store/expanded load operation.
14443
+ simd<uint32_t, N> offset =
14444
+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14445
+ scatter(p, offset * sizeof(T), vals, mask, props);
14446
+ }
14447
+
14448
+ /// template <typename T, int N, typename AccessorTy,
14449
+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14450
+ /// simd<T, N>
14451
+ /// mask_compress_store(AccessorTy acc, simd<T, N> vals, simd_mask<N> mask,
14452
+ /// PropertyListT props = {});
14453
+ /// Mask compressed store to accessor memory (could be local or device
14454
+ /// accessor).
14455
+ /// The function stores data to a memory location using following algorithm:
14456
+ ///
14457
+ /// \code{.cpp}
14458
+ ///
14459
+ /// int Index = 0;
14460
+ /// for (int i = 0; i < N; ++i) {
14461
+ /// if (Mask[i])
14462
+ /// acc[global_offset + Index++] = val[i];
14463
+ /// }
14464
+ /// \endcode
14465
+ ///
14466
+ ///
14467
+ /// @tparam T is the element type.
14468
+ /// @tparam N is the data size.
14469
+ /// @param acc is the accessor to write to.
14470
+ /// @param global_offset is the global offset in bytes.
14471
+ /// @param vals is the data to store.
14472
+ /// @param mask is the mask determining which elements will be stored.
14473
+ /// @param props The compile-time properties. Only cache hint
14474
+ /// properties are used.
14475
+ ///
14476
+ template <typename T, int N, typename AccessorTy,
14477
+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14478
+ __ESIMD_API std::enable_if_t<
14479
+ ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14480
+ detail::is_accessor_with_v<AccessorTy,
14481
+ detail::accessor_mode_cap::can_write>>
14482
+ mask_compress_store(AccessorTy acc, uint32_t global_offset, simd<T, N> vals,
14483
+ simd_mask<N> mask, PropertyListT props = {}) {
14484
+ // offsets::value contains binary masks that for every location at index i it
14485
+ // contains i 1's i.e. 0,1,3,7,...
14486
+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14487
+ // Performing '&' operation with packed mask will leave at index i a bitmask
14488
+ // with number of 1's corresponding to a number of elements to be loaded so
14489
+ // far (number of 1's in the mask preceding the index i). Number of 1's
14490
+ // becomes an index for compressed store/expanded load operation.
14491
+ simd<uint32_t, N> offset =
14492
+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14493
+ scatter<T, N>(acc, offset * sizeof(T) + global_offset, vals, mask, props);
14494
+ }
14495
+
14496
+ /// template <typename T, int N, int M>
14497
+ /// simd<T, N>
14498
+ /// mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N>
14499
+ /// vals, simd_mask<N> mask);
14500
+ /// Mask compressed store to another vector. The function reads data to a
14501
+ /// vector using following algorithm:
14502
+ ///
14503
+ /// \code{.cpp}
14504
+ ///
14505
+ /// int Index = 0;
14506
+ /// for (int i = 0; i < N; ++i) {
14507
+ /// if (Mask[i])
14508
+ /// dst[global_offset + Index++] = vals[i];
14509
+ /// }
14510
+ /// \endcode
14511
+ ///
14512
+ ///
14513
+ /// @tparam T is the element type.
14514
+ /// @tparam N is the data size.
14515
+ /// @tparam M is the source data size.
14516
+ /// @param dst is the vector to write to.
14517
+ /// @param global_offset is an offset to use for all writes.
14518
+ /// @param vals is the data to store.
14519
+ /// @param mask is the mask determining which elements will be stored.
14520
+ ///
14521
+ template <typename T, int N, int M>
14522
+ __ESIMD_API std::enable_if_t<M >= N>
14523
+ mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N> vals,
14524
+ simd_mask<N> mask) {
14525
+ // offsets::value contains binary masks that for every location at index i it
14526
+ // contains i 1's i.e. 0,1,3,7,...
14527
+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14528
+ // Performing '&' operation with packed mask will leave at index i a bitmask
14529
+ // with number of 1's corresponding to a number of elements to be loaded so
14530
+ // far (number of 1's in the mask preceding the index i). Number of 1's
14531
+ // becomes an index for compressed store/expanded load operation.
14532
+ simd<uint32_t, N> offset =
14533
+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14534
+
14535
+ simd<uint16_t, N> Indices = global_offset + offset;
14536
+ dst.iupdate(Indices, vals, mask);
14537
+ }
14538
+
14539
+ /// @} sycl_esimd_mask_compressed
14305
14540
/// @} sycl_esimd_memory
14306
14541
14307
14542
/// @cond EXCLUDE
0 commit comments