intel · kbobrovs · Jul 8, 2021 · Jul 2, 2021 · Jul 6, 2021
@@ -167,8 +167,8 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
-    auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
     MB.select<4, 1, 4, 1>(0, 0) = MA.select<4, 1, 4, 1>(0, 4);
     MB.select<4, 1, 4, 1>(0, 4) = MA.select<4, 1, 4, 1>(0, 0);
     B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -196,8 +196,8 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
-    auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
     MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
     MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);
     B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -326,8 +326,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   simd<ushort, 32> flip16(init_mask16);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
-    auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
     MA.select<4, 1, 4, 1>(0, 0) = MB.select<4, 1, 4, 1>(0, 4);
     MA.select<4, 1, 4, 1>(0, 4) = MB.select<4, 1, 4, 1>(0, 0);
     bool dir_up = (((offset + i) >> (m + 1)) & 1) == 0;
@@ -346,8 +346,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   simd<ushort, 32> flip18(init_mask18);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
-    auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
 
     MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
     MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);

@@ -84,8 +84,8 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
-    auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
     MB.select<4, 1, 4, 1>(0, 0) = MA.select<4, 1, 4, 1>(0, 4);
     MB.select<4, 1, 4, 1>(0, 4) = MA.select<4, 1, 4, 1>(0, 0);
     B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -113,8 +113,8 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
-    auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
     MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
     MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);
     B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -243,8 +243,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   simd<ushort, 32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
-    auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
     MA.select<4, 1, 4, 1>(0, 0) = MB.select<4, 1, 4, 1>(0, 4);
     MA.select<4, 1, 4, 1>(0, 4) = MB.select<4, 1, 4, 1>(0, 0);
     bool dir_up = (((offset + i) >> (m + 1)) & 1) == 0;
@@ -263,8 +263,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   simd<ushort, 32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
-    auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
-    auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
+    auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
+    auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
 
     MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
     MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);

@@ -22,11 +22,11 @@
 #define TUPLE_SZ 1
 
 #if TUPLE_SZ == 1
-#define GATHER_SCATTER_MASK ESIMD_R_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::R
 #elif TUPLE_SZ == 2
-#define GATHER_SCATTER_MASK ESIMD_GR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::GR
 #elif TUPLE_SZ == 4
-#define GATHER_SCATTER_MASK ESIMD_ABGR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::ABGR
 #endif
 
 #define LOG_ENTRIES 8
@@ -148,7 +148,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
     S += T;
   }
 
-  auto cnt_table = S.format<unsigned int, 32, TUPLE_SZ>();
+  auto cnt_table = S.bit_cast_view<unsigned int, 32, TUPLE_SZ>();
   // sum reduction for each bin
   cnt_table.select<16, 1, TUPLE_SZ, 1>(0, 0) +=
       cnt_table.select<16, 1, TUPLE_SZ, 1>(16, 0);
@@ -186,7 +186,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
     S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
 
-    auto cnt_table = S.format<unsigned int, TUPLE_SZ, 32>();
+    auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
     cnt_table.column(0) += prev;
 #pragma unroll
     for (unsigned j = 0; j < TUPLE_SZ; j++) {
@@ -254,7 +254,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
 
     S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
 
-    auto cnt_table = S.format<unsigned int, TUPLE_SZ, 32>();
+    auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
     cnt_table.column(0) += prev;
 #pragma unroll
     for (unsigned j = 0; j < TUPLE_SZ; j++) {

@@ -22,11 +22,11 @@
 #define TUPLE_SZ 1
 
 #if TUPLE_SZ == 1
-#define GATHER_SCATTER_MASK ESIMD_R_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::R
 #elif TUPLE_SZ == 2
-#define GATHER_SCATTER_MASK ESIMD_GR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::GR
 #elif TUPLE_SZ == 4
-#define GATHER_SCATTER_MASK ESIMD_ABGR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::ABGR
 #endif
 
 #define PREFIX_ENTRIES 256
@@ -87,7 +87,7 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) {
   }
 
   // format S to be a 32xTUPLE_SZ matrix
-  auto cnt_table = S.format<unsigned int, 32, TUPLE_SZ>();
+  auto cnt_table = S.bit_cast_view<unsigned int, 32, TUPLE_SZ>();
   // sum reduction for each bin
   cnt_table.select<16, 1, TUPLE_SZ, 1>(0, 0) +=
       cnt_table.select<16, 1, TUPLE_SZ, 1>(16, 0);

@@ -22,11 +22,11 @@
 #define TUPLE_SZ 4
 
 #if TUPLE_SZ == 1
-#define GATHER_SCATTER_MASK ESIMD_R_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::R
 #elif TUPLE_SZ == 2
-#define GATHER_SCATTER_MASK ESIMD_GR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::GR
 #elif TUPLE_SZ == 4
-#define GATHER_SCATTER_MASK ESIMD_ABGR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::ABGR
 #endif
 
 #define PREFIX_ENTRIES 256
@@ -83,7 +83,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
     S += T;
   }
 
-  auto cnt_table = S.format<unsigned int, TUPLE_SZ, 32>();
+  auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
 
   simd<unsigned, TUPLE_SZ> sum;
 #pragma unroll

@@ -22,11 +22,11 @@
 #define TUPLE_SZ 2
 
 #if TUPLE_SZ == 1
-#define GATHER_SCATTER_MASK ESIMD_R_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::R
 #elif TUPLE_SZ == 2
-#define GATHER_SCATTER_MASK ESIMD_GR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::GR
 #elif TUPLE_SZ == 4
-#define GATHER_SCATTER_MASK ESIMD_ABGR_ENABLE
+#define GATHER_SCATTER_MASK rgba_channel_mask::ABGR
 #endif
 
 #define LOG_ENTRIES 8
@@ -110,7 +110,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
     S += T;
   }
 
-  auto cnt_table = S.format<unsigned int, 32, TUPLE_SZ>();
+  auto cnt_table = S.bit_cast_view<unsigned int, 32, TUPLE_SZ>();
   // sum reduction for each bin
   cnt_table.select<16, 1, TUPLE_SZ, 1>(0, 0) +=
       cnt_table.select<16, 1, TUPLE_SZ, 1>(16, 0);
@@ -161,7 +161,7 @@ void cmk_acum_iterative_low(unsigned *buf, unsigned h_pos,
     S += T;
   }
 
-  auto cnt_table = S.format<unsigned int, 32, TUPLE_SZ>();
+  auto cnt_table = S.bit_cast_view<unsigned int, 32, TUPLE_SZ>();
   // sum reduction for each bin
   cnt_table.select<16, 1, TUPLE_SZ, 1>(0, 0) +=
       cnt_table.select<16, 1, TUPLE_SZ, 1>(16, 0);
@@ -199,7 +199,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
     S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
 
-    auto cnt_table = S.format<unsigned int, TUPLE_SZ, 32>();
+    auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
     cnt_table.column(0) += prev;
     for (unsigned j = 0; j < TUPLE_SZ; j++) {
       // step 1

@@ -113,7 +113,7 @@ int main(int argc, char *argv[]) {
 
             simd<float, (HEIGHT + 10) * 32> vin;
             // matrix HEIGHT+10 x 32
-            auto in = vin.format<float, HEIGHT + 10, 32>();
+            auto in = vin.bit_cast_view<float, HEIGHT + 10, 32>();
 
             //
             // rather than loading all data in

@@ -199,8 +199,8 @@ int main(int argc, char *argv[]) {
                 src = histogram.select<8, 1>(i);
 
 #ifdef __SYCL_DEVICE_ONLY__
-                flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
-                    bins, offset, src, 1);
+                flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
+                                                             1);
                 offset += 8 * sizeof(unsigned int);
 #else
                 simd<unsigned int, 8> vals;

@@ -45,14 +45,14 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
     auto start_addr = ((unsigned int *)input_ptr) + start_off;
     simd<uint, 32> data;
     data.copy_from(start_addr);
-    auto in = data.format<uchar>();
+    auto in = data.bit_cast_view<uchar>();
 
 #pragma unroll
     for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
       // Accumulate local histogram for each pixel value
       simd<uint, 16> dataOffset = in.select<16, 1>(j).read();
       dataOffset *= sizeof(int);
-      slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
+      slm_atomic<atomic_op::inc, uint, 16>(dataOffset, 1);
     }
     start_off += BLOCK_WIDTH;
   }
@@ -61,10 +61,10 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
   local_histogram = slm_load<uint32_t, 16>(slm_offset);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
+                                           local_histogram.select<8, 1>(0), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),
+                                           local_histogram.select<8, 1>(8), 1);
 }
 
 // This function calculates histogram of the image with the CPU.

@@ -46,14 +46,14 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
     auto start_addr = ((unsigned int *)input_ptr) + start_off;
     simd<uint, 32> data;
     data.copy_from(start_addr);
-    auto in = data.format<uchar>();
+    auto in = data.bit_cast_view<uchar>();
 
 #pragma unroll
     for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
       // Accumulate local histogram for each pixel value
       simd<uint, 16> dataOffset = in.select<16, 1>(j).read();
       dataOffset *= sizeof(int);
-      slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
+      slm_atomic<atomic_op::inc, uint, 16>(dataOffset, 1);
     }
     start_off += BLOCK_WIDTH;
   }
@@ -62,10 +62,10 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
   local_histogram = slm_load<uint32_t, 16>(slm_offset);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
+                                           local_histogram.select<8, 1>(0), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),
+                                           local_histogram.select<8, 1>(8), 1);
 }
 
 // This function calculates histogram of the image with the CPU.

@@ -40,14 +40,14 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   for (int y = 0; y < num_blocks; y++) {
     auto start_addr = ((unsigned int *)input_ptr) + start_off;
     auto data = block_load<uint, 32>(start_addr);
-    auto in = data.format<uchar>();
+    auto in = data.bit_cast_view<uchar>();
 
 #pragma unroll
     for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
       // Accumulate local histogram for each pixel value
       simd<uint, 16> dataOffset = in.select<16, 1>(j).read();
       dataOffset *= sizeof(int);
-      slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
+      slm_atomic<atomic_op::inc, uint, 16>(dataOffset, 1);
     }
     start_off += BLOCK_WIDTH;
   }
@@ -56,10 +56,10 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
   local_histogram = slm_load<uint32_t, 16>(slm_offset);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
-  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
-      output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
+                                           local_histogram.select<8, 1>(0), 1);
+  flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),
+                                           local_histogram.select<8, 1>(8), 1);
 }
 
 // This function calculates histogram of the image with the CPU.

@@ -191,8 +191,8 @@ int main(int argc, char *argv[]) {
               src = histogram.select<8, 1>(i);
 
 #ifdef __SYCL_DEVICE_ONLY__
-              flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
-                  bins, offset, src, 1);
+              flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
+                                                           1);
               offset += 8 * sizeof(unsigned int);
 #else
               simd<unsigned int, 8> vals;

@@ -214,10 +214,10 @@ int main(int argc, char *argv[]) {
               src = histogram.select<8, 1>(i);
 
 #ifdef __SYCL_DEVICE_ONLY__
-              // flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int,
+              // flat_atomic<atomic_op::add, unsigned int,
               // 8>(bins, offset, src, 1);
-              atomic_write<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
-                  bins, offset, src, 1);
+              atomic_write<atomic_op::add, unsigned int, 8>(bins, offset, src,
+                                                            1);
               offset += 8 * sizeof(unsigned int);
 #else
               simd<unsigned int, 8> vals;