[SYCL][COMPAT] Add bfe_safe and bfi_safe APIs (#14006)

joeatodd · Alcpz · web-flow · commit 6ac0a3fb5437 · 2024-06-11T12:57:18.000+01:00
This PR adds bit-field extract (`bfe_safe`) and bit-field insert
(`bfi_safe`) to the `math.hpp` header.

These are 'bounds checked' variants of the `detail::bfe` and
`detail::bfi` APIs respectively, though in addition to bounds checking
the `_safe` variants also provide:
 - asm for NVPTX
 - Proper treatment of signed types (`bfe_safe`)

As such, it's not clear whether the 'unsafe' variants ought to be
exposed at all and so I've put them in `detail::` for now. What are the
expected semantics in relation to the `_safe` variants? They would
likely need separate tests, and it's not clear that DPCT use these.

---------

Signed-off-by: Joe Todd <joe.todd@codeplay.com>
Co-authored-by: Alberto Cabrera Pérez <alberto.cabrera@intel.com>
diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
@@ -2118,6 +2118,48 @@ template <typename RetT, typename AT, typename BT>
 inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c);
 ```
 
+The math header file provides APIs for bit-field insertion (`bfi_safe`) and
+bit-field extraction (`bfe_safe`). These are bounds-checked variants of
+underlying `detail` APIs (`detail::bfi`, `detail::bfe`) which, in future
+releases, will be exposed to the user.
+
+```c++
+
+/// Bitfield-insert with boundary checking.
+///
+/// Align and insert a bit field from \param x into \param y . Source \param
+/// bit_start gives the starting bit position for the insertion, and source
+/// \param num_bits gives the bit field length in bits.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The value into which the bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
+                  const uint32_t num_bits);
+
+/// Bitfield-extract with boundary checking.
+///
+/// Extract bit field from \param source and return the zero or sign-extended
+/// result. Source \param bit_start gives the bit field starting bit position,
+/// and source \param num_bits gives the bit field length in bits.
+///
+/// The result is padded with the sign bit of the extracted field. If `num_bits`
+/// is zero, the result is zero. If the start position is beyond the msb of the
+/// input, the result is filled with the replicated sign bit of the extracted
+/// field.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extract from.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe_safe(const T source, const uint32_t bit_start,
+                  const uint32_t num_bits);
+```
+
 ## Sample Code
 
 Below is a simple linear algebra sample, which computes `y = mx + b` implemented
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
@@ -186,8 +186,160 @@ inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
 }
 #endif
 
+// FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
+// improved & semantics understood
+/// Bitfield-extract: `source >> bit_start`, masked to the low `num_bits` bits.
+///
+/// \tparam T The type of \param source value, must be an unsigned integer.
+/// \param source The source value to extract from.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe(const T source, const uint32_t bit_start,
+             const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+  // FIXME(syclcompat-lib-reviewers): This ternary was added to catch a case
+  // which may be undefined anyway. Consider that we are losing perf here.
+  const T mask =
+      num_bits >= CHAR_BIT * sizeof(T) ? T{-1} : ((T{1} << num_bits) - 1);
+  return (source >> bit_start) & mask;
+}
+
 } // namespace detail
 
+/// Bitfield-extract with boundary checking.
+///
+/// Extract bit field from \param source and return the zero or sign-extended
+/// result. Source \param bit_start gives the bit field starting bit position,
+/// and source \param num_bits gives the bit field length in bits.
+///
+/// The result is padded with the sign bit of the extracted field. If `num_bits`
+/// is zero, the result is zero. If the start position is beyond the msb of the
+/// input, the result is filled with the replicated sign bit of the extracted
+/// field.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extract from.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe_safe(const T source, const uint32_t bit_start,
+                  const uint32_t num_bits) {
+  static_assert(std::is_integral_v<T>);
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
+                std::is_same_v<T, int32_t>) {
+    int32_t res{};
+    asm volatile("bfe.s32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((int32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint8_t> ||
+                       std::is_same_v<T, uint16_t> ||
+                       std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfe.u32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    T res{};
+    asm volatile("bfe.s64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    T res{};
+    asm volatile("bfe.u64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  const uint32_t bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width); // may equal bit_width
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  if constexpr (std::is_signed_v<T>) {
+    // FIXME(syclcompat-lib-reviewers): As above, catching a case whose result
+    // is undefined and likely losing perf.
+    const T mask = len >= bit_width ? T{-1} : static_cast<T>((T{1} << len) - 1);
+
+    // Find the sign bit: the result is padded with the sign bit of the
+    // extracted field.
+    // Note: if the requested num_bits == 0, we return zero via sign_bit = 0.
+    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
+    const T sign_bit = num_bits != 0 && ((source >> sign_bit_pos) & 1);
+    const T sign_bit_padding = (-sign_bit & ~mask);
+    return ((source >> pos) & mask) | sign_bit_padding;
+  } else {
+    return syclcompat::detail::bfe(source, pos, len);
+  }
+}
+
+namespace detail {
+// FIXME(syclcompat-lib-reviewers): move bfi outside detail once perf is
+// improved & semantics understood
+/// Bitfield-insert.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The value into which the bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi(const T x, const T y, const uint32_t bit_start,
+             const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+
+  // If bit_start > bit_width || num_bits == 0, the masks below select y only.
+  const T ignore_bfi = static_cast<T>(bit_start > bit_width || num_bits == 0);
+  T extract_bitfield_mask = (static_cast<T>(~T{0}) >> (bit_width - num_bits))
+                            << bit_start;
+  T clean_bitfield_mask = ~extract_bitfield_mask;
+  return (y & (-ignore_bfi | clean_bitfield_mask)) |
+         (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask));
+}
+} // namespace detail
+
+/// Bitfield-insert with boundary checking.
+///
+/// Align and insert a bit field from \param x into \param y . Source \param
+/// bit_start gives the starting bit position for the insertion, and source
+/// \param num_bits gives the bit field length in bits.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The value into which the bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
+                  const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, uint16_t> ||
+                std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfi.b32 %0, %1, %2, %3, %4;"
+                 : "=r"(res)
+                 : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start),
+                   "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    uint64_t res{};
+    asm volatile("bfi.b64 %0, %1, %2, %3, %4;"
+                 : "=l"(res)
+                 : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width); // may equal bit_width
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  return syclcompat::detail::bfi(x, y, pos, len);
+}
+
 /// Emulated function for __funnelshift_l
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift) {
diff --git a/sycl/test-e2e/syclcompat/math/math_bfe.cpp b/sycl/test-e2e/syclcompat/math/math_bfe.cpp
@@ -0,0 +1,182 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_bfe.cpp
+ *
+ *  Description:
+ *    math bitfield extract tests
+ **************************************************************************/
+
+// ===----------- math_bfe.cpp ------------------ -*- C++ -* --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <bitset>
+#include <chrono>
+#include <iostream>
+#include <limits.h>
+#include <random>
+#include <stdint.h>
+#include <sycl/detail/core.hpp>
+#include <syclcompat/math.hpp>
+#include <type_traits>
+#include <vector>
+
+template <typename T>
+inline std::enable_if_t<std::is_integral_v<T>, T>
+bfe_slow(const T source, const uint32_t bit_start, const uint32_t num_bits) {
+  const uint32_t msb = CHAR_BIT * sizeof(T) - 1;
+  const uint32_t pos = bit_start;
+  const uint32_t len = num_bits;
+
+  // If the requested bit field length is zero, the result is zero.
+  if (num_bits == 0)
+    return 0ULL;
+
+  T sbit;
+  std::bitset<CHAR_BIT * sizeof(T)> source_bitset(source);
+  if (std::is_unsigned_v<T> || len == 0) // len != 0 here (early return)
+    sbit = 0;
+  else
+    sbit = source_bitset[std::min(pos + len - 1, msb)];
+
+  // If the start position is beyond the msb of the input, the result is
+  // filled with the replicated sign bit of the extracted field:
+  // -sbit is 1111... when sbit is 1, and 0 when sbit is 0.
+  if (bit_start > msb)
+    return -sbit;
+
+  std::bitset<CHAR_BIT * sizeof(T)> result_bitset;
+  for (uint8_t i = 0; i <= msb; ++i)
+    result_bitset[i] =
+        (i < len && pos + i <= msb) ? source_bitset[pos + i] : sbit;
+  return result_bitset.to_ullong();
+}
+
+template <typename T> bool test(const char *Msg, int N) {
+  uint32_t bit_width = CHAR_BIT * sizeof(T);
+  T min_value = std::numeric_limits<T>::min();
+  T max_value = std::numeric_limits<T>::max();
+  std::random_device rd;
+  std::mt19937::result_type seed =
+      rd() ^
+      ((std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::seconds>(
+               std::chrono::system_clock::now().time_since_epoch())
+               .count() +
+       (std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::high_resolution_clock::now().time_since_epoch())
+               .count());
+
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<T> rd_source(min_value, max_value);
+
+  // Define a small overshoot so that we adequately test out-of-range cases
+  // without sacrificing depth of testing of valid start+length combinations
+  constexpr uint32_t overshoot = 2;
+  std::uniform_int_distribution<uint32_t> rd_start(0, bit_width + overshoot);
+  std::uniform_int_distribution<uint32_t> rd_length(0, bit_width + overshoot);
+
+  std::vector<T> sources(N, 0);
+  std::vector<T> compat_results(N, 0);
+  std::vector<T> slow_results(N, 0);
+  std::vector<uint32_t> starts(N, 0);
+  std::vector<uint32_t> lengths(N, 0);
+  for (int i = 0; i < N; ++i) {
+    sources[i] = rd_source(gen);
+    starts[i] = rd_start(gen);
+    lengths[i] = rd_length(gen);
+  }
+
+  sycl::buffer<T, 1> source_buffer(sources.data(), N);
+  sycl::buffer<T, 1> compat_results_buffer(compat_results.data(), N);
+  sycl::buffer<T, 1> slow_results_buffer(slow_results.data(), N);
+  sycl::buffer<uint32_t, 1> starts_buffer(starts.data(), N);
+  sycl::buffer<uint32_t, 1> lengths_buffer(lengths.data(), N);
+
+  sycl::queue que;
+  que.submit([&](sycl::handler &handler) {
+    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor compat_result_accessor(compat_results_buffer, handler,
+                                          sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      compat_result_accessor[i] = syclcompat::bfe_safe<T>(
+          source_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.submit([&](sycl::handler &handler) {
+    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor slow_result_accessor(slow_results_buffer, handler,
+                                        sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      slow_result_accessor[i] = bfe_slow<T>( // reference result
+          source_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.wait_and_throw();
+  sycl::host_accessor source_accessor(source_buffer, sycl::read_only);
+  sycl::host_accessor start_accessor(starts_buffer, sycl::read_only);
+  sycl::host_accessor length_accessor(lengths_buffer, sycl::read_only);
+  sycl::host_accessor compat_result_accessor(compat_results_buffer,
+                                             sycl::read_only);
+  sycl::host_accessor slow_result_accessor(slow_results_buffer,
+                                           sycl::read_only);
+
+  int failed = 0;
+  for (int i = 0; i < N; ++i) {
+    if (compat_result_accessor[i] != slow_result_accessor[i]) {
+      failed++;
+      std::cout << "[source = " << source_accessor[i]
+                << ", bit_start = " << start_accessor[i]
+                << ", num_bits = " << length_accessor[i] << "] failed, expect "
+                << slow_result_accessor[i] << " but got "
+                << compat_result_accessor[i] << std::endl;
+    }
+  }
+  std::cout << "===============" << std::endl;
+  std::cout << "Test: " << Msg << std::endl;
+  std::cout << "Total: " << N << std::endl;
+  std::cout << "Success: " << N - failed << std::endl;
+  std::cout << "Failed: " << failed << std::endl;
+  std::cout << "===============" << std::endl;
+  return !failed; // true when every case matched the reference
+}
+
+int main() {
+  const int N = 1000; // NOTE(review): test() calls vanish if NDEBUG is set
+  assert(test<int16_t>("int16", N));
+  assert(test<uint16_t>("uint16", N));
+  assert(test<int32_t>("int32", N));
+  assert(test<uint32_t>("uint32", N));
+  assert(test<int64_t>("int64", N));
+  assert(test<uint64_t>("uint64", N));
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_bfi.cpp b/sycl/test-e2e/syclcompat/math/math_bfi.cpp