Skip to content

Commit bb811e5

Browse files
authored
Auto pulldown and update tc files for xmain-cand branch
Auto pulldown and update tc files for xmain-cand branch on 20220604
2 parents 8e3bd29 + c4bd9f1 commit bb811e5

15 files changed

+543
-12
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
using namespace cl::sycl;
7+
using namespace sycl::ext::intel::esimd;
8+
using namespace sycl::ext::intel::experimental::esimd;
9+
10+
// TODO: The SPEC does not say what values are returned for lsc_slm_gather
11+
// when the corresponding elements of the predicate/mask is zero.
12+
// It is assumed to be undefined values there.
13+
// Thus this test does not check those elements now. From the API point of view
14+
// it may be better to have another argument for the values being copied to
15+
// the result when the mask bit is 0.
16+
17+
// Verifies the LSC SLM *load* intrinsics: lsc_slm_block_load when Transpose
// is true, lsc_slm_gather otherwise.
//
// Template parameters:
//   CaseNum    - test id, used only for diagnostic output.
//   T          - element type; sizeof(T) must be >= 4 (enforced below).
//   Groups     - number of work-groups launched.
//   LocalRange - number of work-items per work-group.
//   VL         - vector length: elements per block load / offsets per gather.
//   NChannels  - elements loaded per offset (gather path only).
//   Transpose  - selects the block-load path (true) vs the gather path (false).
//   DS         - LSC data size / conversion kind.
//
// PMask builds the per-lane predicate for the gather; output elements whose
// predicate bit is 0 are skipped during verification because their loaded
// values are undefined (see the TODO above).
//
// Returns true iff every checked element of the shared output buffer matches
// its expected value.
template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
          uint16_t VL, uint16_t NChannels, bool Transpose,
          lsc_data_size DS = lsc_data_size::default_size>
bool test(uint32_t PMask = ~0) {
  // Compile-time legality checks for the requested LSC configuration.
  static_assert((NChannels == 1) || !Transpose,
                "Transpose must have exec size 1");
  if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
    static_assert(!Transpose, "Conversion types may not use vector");
    static_assert(NChannels == 1, "Only D32 and D64 support vector load");
  }

  static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
  static_assert(sizeof(T) >= 4,
                "D8 and D16 are valid only in 2D block load/store");

  if constexpr (!Transpose && NChannels > 1) {
    static_assert(VL == 16 || VL == 32,
                  "IGC prohibits execution size less than SIMD size when "
                  "vector size is greater than 1");
  }

  // Mask applied to the expected values so comparisons model the truncation
  // performed by the conversion data sizes (u8u32/u16u32/u16u32h).
  T VMask = static_cast<T>(-1);
  if constexpr (DS == lsc_data_size::u8u32)
    VMask = static_cast<T>(0xff);
  else if constexpr (DS == lsc_data_size::u16u32)
    VMask = static_cast<T>(0xffff);
  else if constexpr (DS == lsc_data_size::u16u32h)
    VMask = static_cast<T>(0xffff0000);

  queue Q(gpu_selector{});
  auto D = Q.get_device();
  std::cout << "Running case #" << CaseNum << " on "
            << D.get_info<info::device::name>() << std::endl;

  nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
  // Shared (USM) buffer receiving VL * NChannels results per work-item.
  constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
  T *Out = malloc_shared<T>(OutSize, Q);
  memset(Out, 0, OutSize * sizeof(T));

  try {
    Q.submit([&](handler &cgh) {
      cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
        uint32_t GID = NDId.get_global_id(0);
        uint32_t LID = NDId.get_local_id(0);
        uint32_t GroupID = NDId.get_group_linear_id();

        // Allocate and init 128-byte multiple size SLM memory with
        // consecutive values. i-th group gets values:
        // {0, 1, 2, ...} + GroupID * 1000000.
        constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
        constexpr uint32_t SLMSize =
            (ResultSIMDByteSize * LocalRange + 127) & ~127;
        slm_init(SLMSize);
        // Only the group's first work-item fills SLM; everyone else waits at
        // the barrier below before loading.
        if (NDId.get_local_id(0) == 0) {
          simd<T, 4> Vals(GroupID * 1000000, 1);
          for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
            slm_block_store<T, 4>(I, Vals);
            Vals += 4;
          }
        }
        barrier();

        if constexpr (Transpose) {
          // Block-load path: one contiguous VL-element load per work-item.
          auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
          Vals.copy_to(Out + GID * VL);
        } else {
          // Gather path: VL offsets, NChannels elements per offset.
          simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
                                     NChannels * sizeof(T));

          // Create the predicate for the gather from 'PMask'.
          simd_mask<VL> Pred;
          for (int I = 0; I < VL; I++)
            Pred.template select<1, 1>(I) = (PMask >> I) & 1;

          simd<T, VL *NChannels> Vals =
              lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);

          Vals.copy_to(Out + GID * VL * NChannels);
        }
      });
    }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  // Host-side verification of the kernel output.
  if constexpr (Transpose) {
    for (uint32_t I = 0; I < OutSize; I++) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I % (LocalRange * VL * NChannels);
      T ExpectedVal = GroupId * 1000000 + LID;
      if (Out[I] != ExpectedVal) {
        Passed = false;
        std::cout << I << ": Value = " << Out[I]
                  << ", Expected value = " << ExpectedVal << std::endl;
      }
    }
  } else {
    for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I % (LocalRange * VL * NChannels);
      T ExpectedValBase = GroupId * 1000000 + LID;
      for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
        for (int J = 0; J < VL; J++) {
          uint32_t OutIndex = I + ChannelId * VL + J;

          // Lanes with a zero predicate bit hold undefined values; skip them
          // (see the TODO at the top of this file).
          if (((PMask >> J) & 1) == 0)
            continue;
          T ExpectedVal = (ExpectedValBase + ChannelId + J * NChannels) & VMask;
          if (Out[OutIndex] != ExpectedVal) {
            Passed = false;
            std::cout << OutIndex << ": Value = " << Out[OutIndex]
                      << ", Expected value = " << ExpectedVal << std::endl;
          }
        }
      }
    }
  }

  sycl::free(Out, Q);

  if (!Passed)
    std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
  return Passed;
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
using namespace cl::sycl;
7+
using namespace sycl::ext::intel::esimd;
8+
using namespace sycl::ext::intel::experimental::esimd;
9+
10+
// Verifies the LSC SLM *store* intrinsics: lsc_slm_block_store when Transpose
// is true, lsc_slm_scatter otherwise. SLM is pre-filled with a poison pattern
// (0xBAADF00D...), the store under test overwrites it, and the SLM content is
// then copied out to USM and checked on the host.
//
// Template parameters:
//   CaseNum    - test id, used only for diagnostic output.
//   T          - element type; sizeof(T) must be >= 4 (enforced below).
//   Groups     - number of work-groups launched.
//   LocalRange - number of work-items per work-group.
//   VL         - vector length: elements per block store / offsets per scatter.
//   NChannels  - elements stored per offset (scatter path only).
//   Transpose  - selects block store (true) vs scatter (false).
//   DS         - LSC data size / conversion kind.
//
// PMask builds the per-lane predicate for the scatter; lanes with a zero bit
// must leave the poison pattern untouched, which the verification checks.
//
// Returns true iff every element of the copied-out SLM matches its expected
// value.
template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
          uint16_t VL, uint16_t NChannels, bool Transpose,
          lsc_data_size DS = lsc_data_size::default_size>
bool test(uint32_t PMask = ~0) {
  // Compile-time legality checks for the requested LSC configuration.
  static_assert((NChannels == 1) || !Transpose,
                "Transpose must have exec size 1");
  if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
    static_assert(!Transpose, "Conversion types may not use vector");
    static_assert(NChannels == 1, "Only D32 and D64 support vector load");
  }

  static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
  static_assert(sizeof(T) >= 4,
                "D8 and D16 are valid only in 2D block load/store");

  if constexpr (!Transpose && NChannels > 1) {
    static_assert(VL == 16 || VL == 32,
                  "IGC prohibits execution size less than SIMD size when "
                  "vector size is greater than 1");
  }

  // Mask applied to both expected and actual values so comparisons model the
  // truncation performed by the conversion data sizes.
  T VMask = static_cast<T>(-1);
  if constexpr (DS == lsc_data_size::u8u32)
    VMask = static_cast<T>(0xff);
  else if constexpr (DS == lsc_data_size::u16u32)
    VMask = static_cast<T>(0xffff);
  else if constexpr (DS == lsc_data_size::u16u32h)
    VMask = static_cast<T>(0xffff0000);

  queue Q(gpu_selector{});
  auto D = Q.get_device();
  std::cout << "Running case #" << CaseNum << " on "
            << D.get_info<info::device::name>() << std::endl;

  nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
  // Shared (USM) buffer receiving the group's full SLM content for checking.
  constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
  T *Out = malloc_shared<T>(OutSize, Q);
  memset(Out, 0, OutSize * sizeof(T));

  try {
    Q.submit([&](handler &cgh) {
      cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
        uint32_t GID = NDId.get_global_id(0);
        uint32_t LID = NDId.get_local_id(0);
        uint32_t GroupID = NDId.get_group_linear_id();

        // 1. Allocate and init 128-byte multiple size SLM memory with special
        // values.
        constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
        constexpr uint32_t SLMSize =
            (ResultSIMDByteSize * LocalRange + 127) & ~127;
        slm_init(SLMSize);
        // Poison-fill SLM from one work-item so unwritten (predicated-off)
        // lanes are detectable during verification.
        if (NDId.get_local_id(0) == 0) {
          simd<T, 4> Vals = static_cast<T>(0xBAADF00DBAADF00D);
          for (int I = 0; I < SLMSize; I += 4 * sizeof(T))
            slm_block_store<T, 4>(I, Vals);
        }
        barrier();

        // 2. Use STORE intrinsics that are being verified in this test.
        if constexpr (Transpose) {
          simd<T, VL> Vals(GroupID * 1000000 + LID * 1000, 1);
          lsc_slm_block_store<T, VL, DS>(LID * VL * sizeof(T), Vals);
        } else {

          // Create the predicate for the scatter from 'PMask'.
          simd_mask<VL> Pred;
          for (int I = 0; I < VL; I++)
            Pred.template select<1, 1>(I) = (PMask >> I) & 1;

          simd<T, VL * NChannels> Vals(GroupID * 1000000 + LID * 1000, 1);
          simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
                                     NChannels * sizeof(T));
          lsc_slm_scatter<T, NChannels, DS>(Offsets, Vals, Pred);
        }
        barrier();

        // 3. Simply load the content of SLM and store it to USM.
        if (NDId.get_local_id(0) == 0) {
          int End = LocalRange * VL * NChannels;
          for (int I = 0; I < End; I += 4) {
            auto Vals = slm_block_load<T, 4>(I * sizeof(T));

            // If 'VL' is small, simd<T, 4> cannot be safely used
            if (I + 4 > End) {
              // Tail: copy element-wise to avoid reading past 'End'.
              for (int J = 0; J + I < End; J++)
                Out[GroupID * LocalRange * VL * NChannels + I + J] =
                    (T)Vals[J];
            } else {
              Vals.copy_to(Out + GroupID * LocalRange * VL * NChannels + I);
            }
          }
        }
      });
    }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  // Host-side verification of the SLM content copied out to USM.
  if constexpr (Transpose) {
    for (uint32_t I = 0; I < OutSize; I++) {
      uint32_t GroupId = I / (LocalRange * VL);
      uint32_t LID = I / VL % LocalRange;
      T ExpectedVal = GroupId * 1000000 + LID * 1000 + I % VL;
      if (Out[I] != ExpectedVal) {
        Passed = false;
        std::cout << I << ": Value = " << Out[I]
                  << ", Expected value = " << ExpectedVal << std::endl;
      }
    }
  } else {
    for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I / (VL * NChannels) % LocalRange;
      T ExpectedValBase = GroupId * 1000000 + LID * 1000 + I % (VL * NChannels);
      T ExpectedValInc = 0;
      // MaskIndex walks PMask one bit per offset; MaskIndexTimer counts the
      // NChannels consecutive elements written under the same predicate bit.
      uint32_t MaskIndex = 0;
      uint32_t MaskIndexTimer = 0;
      for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
        for (int J = 0; J < VL; J++) {
          uint32_t OutIndex = I + ChannelId * VL + J;
          // Predicated-off lanes must still hold the poison pattern.
          T ExpectedVal = ((PMask >> MaskIndex) & 1)
                              ? (ExpectedValBase + ExpectedValInc)
                              : static_cast<T>(0xBAADF00DBAADF00D);
          ExpectedVal &= VMask;
          MaskIndexTimer++;
          if (MaskIndexTimer >= NChannels) {
            MaskIndexTimer = 0;
            MaskIndex++;
          }

          ExpectedValInc += VL;
          if (ExpectedValInc >= VL * NChannels)
            ExpectedValInc = (ExpectedValInc % (VL * NChannels)) + 1;

          T OutVal = Out[OutIndex] & VMask;
          if (OutVal != ExpectedVal) {
            Passed = false;
            std::cout << OutIndex << ": Value = " << Out[OutIndex]
                      << ", Expected value = " << ExpectedVal << std::endl;
          }
        }
      }
    }
  }

  sycl::free(Out, Q);

  if (!Passed)
    std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
  return Passed;
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//==------------ lsc_block_store_u64.cpp - DPC++ ESIMD on-device test ------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
#include "Inputs/lsc_block_store.hpp"
14+
15+
constexpr uint32_t seed = 363;
16+
using T = uint64_t;
17+
18+
int main(void) {
19+
srand(seed);
20+
bool passed = true;
21+
22+
passed &= test<1, T, 1, 1, 8, 8>(11, 20, 14, 3, 11);
23+
passed &= test<2, T, 2, 2, 2, 2>(3, 3, 8, 1, 1);
24+
25+
std::cout << (passed ? "Passed\n" : "FAILED\n");
26+
return passed ? 0 : 1;
27+
}

SYCL/ESIMD/lsc/lsc_slm_block_load.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// REQUIRES: gpu-intel-pvc
2+
// UNSUPPORTED: cuda || hip
3+
// RUN: %clangxx -fsycl %s -o %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
6+
// This test verifies the correctness of LSC intrinsics loading
7+
// from SLM memory.
8+
9+
#include "Inputs/lsc_slm_load.hpp"
10+
11+
// This test verifies the correctness of LSC SLM block load intrinsics.
12+
13+
// Id - test id.
14+
// NGroups - number of work groups.
15+
// LocalSize - number work items in each work group.
16+
// VL - number of offsets used in the gather operation.
17+
template <int Id, int NGroups, int LocalSize, int VL> bool test_load() {
18+
bool Passed = true;
19+
Passed &= test<Id, uint32_t, NGroups, LocalSize, VL, 1, true>();
20+
Passed &= test<Id + 1, uint64_t, NGroups, LocalSize, VL, 1, true>();
21+
return Passed;
22+
}
23+
24+
int main() {
  // Run every configuration and fold the results together so that all cases
  // are reported even after a failure.
  // Argument order: test_load<Id, NGroups, LocalSize, VL>().
  bool AllPassed = true;
  AllPassed &= test_load<0, 1, 1, 4>();
  AllPassed &= test_load<2, 1, 7, 16>();
  AllPassed &= test_load<4, 4, 7, 16>();
  AllPassed &= test_load<6, 16, 8, 8>();
  AllPassed &= test_load<8, 2, 4, 32>();
  AllPassed &= test_load<10, 2, 4, 64>();

  std::cout << (AllPassed ? "Passed" : "FAILED") << std::endl;
  return AllPassed ? 0 : 1;
}

0 commit comments

Comments
 (0)