[SYCL] [NATIVECPU] Implement urKernelSetArgLocal (#11101)

PietroGhg · web-flow · commit 00ec5be66c6a · 2023-09-07T13:14:22.000+01:00
This PR adds support for `local_accessor` to the Native CPU adapter by
implementing `urKernelSetArgLocal`.
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -64,6 +64,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper event dep management
   sycl::detail::NDRDescT ndr =
       getNDRDesc(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
+  hKernel->handleLocalArgs();
 
   __nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
@@ -90,6 +91,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
+  hKernel->_localArgInfo.clear();
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.cpp
@@ -54,12 +54,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal(
     ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
     const ur_kernel_arg_local_properties_t *pProperties) {
-  std::ignore = hKernel;
-  std::ignore = argIndex;
-  std::ignore = argSize;
   std::ignore = pProperties;
-
-  DIE_NO_IMPLEMENTATION
+  // Append a placeholder; handleLocalArgs() points it into the memory pool
+  // before launch. NOTE(review): appending assumes args come in index order.
+  hKernel->_args.emplace_back(nullptr);
+  hKernel->_localArgInfo.emplace_back(argIndex, argSize);
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.hpp
@@ -17,6 +17,13 @@ using nativecpu_kernel_t = void(const sycl::detail::NativeCPUArgDesc *,
 using nativecpu_ptr_t = nativecpu_kernel_t *;
 using nativecpu_task_t = std::function<nativecpu_kernel_t>;
 
+// Index and size of one local kernel arg, as given to urKernelSetArgLocal.
+struct local_arg_info_t {
+  local_arg_info_t(uint32_t idx, size_t sz) : argIndex{idx}, argSize{sz} {}
+  uint32_t argIndex;
+  size_t argSize;
+};
+
 struct ur_kernel_handle_t_ : RefCounted {
 
   ur_kernel_handle_t_(const char *name, nativecpu_task_t subhandler)
@@ -25,4 +32,47 @@ struct ur_kernel_handle_t_ : RefCounted {
   const char *_name;
   nativecpu_task_t _subhandler;
   std::vector<sycl::detail::NativeCPUArgDesc> _args;
+  std::vector<local_arg_info_t> _localArgInfo;
+
+  // To be called before enqueuing the kernel: resizes the local memory pool
+  // and points every local arg at its own slice, packed back to back.
+  void handleLocalArgs() {
+    updateMemPool();
+    char *const poolBase = static_cast<char *>(_localMemPool);
+    size_t nextOffset = 0;
+    for (const local_arg_info_t &info : _localArgInfo) {
+      _args[info.argIndex].MPtr = poolBase + nextOffset;
+      // Todo: update this offset computation when we have work-group
+      // level parallelism.
+      nextOffset += info.argSize;
+    }
+  }
+
+  // Releases the local memory pool. free() is a no-op on a null pointer, so
+  // a kernel that never allocated a pool needs no special casing.
+  ~ur_kernel_handle_t_() {
+    free(_localMemPool);
+  }
+
+private:
+  // Resizes the local memory pool to the sum of the sizes of all local args.
+  // The pool is left untouched when there are no local args or when it
+  // already has the requested size.
+  // Todo: currently we execute only one work-group at a time, so for each
+  // local arg we can allocate just 1 * argSize local arg. When we implement
+  // work-group level parallelism we should allocate N * argSize where N is
+  // the number of work groups being executed in parallel (e.g. number of
+  // threads in the thread pool).
+  void updateMemPool() {
+    size_t reqSize = 0;
+    for (const auto &entry : _localArgInfo) {
+      reqSize += entry.argSize;
+    }
+    if (reqSize == 0 || reqSize == _localMemPoolSize) {
+      return;
+    }
+    // realloc handles the nullptr case. On failure it returns nullptr and
+    // leaves the old block allocated, so the result goes through a temporary:
+    // assigning it directly would leak the old block and record a size for a
+    // pool that does not exist.
+    void *newPool = realloc(_localMemPool, reqSize);
+    if (!newPool) {
+      // Drop the stale pool and reset the size so that the next call retries
+      // the allocation instead of returning early with a null pool.
+      free(_localMemPool);
+      reqSize = 0;
+    }
+    _localMemPool = newPool;
+    _localMemPoolSize = reqSize;
+  }
+  void *_localMemPool = nullptr;
+  size_t _localMemPoolSize = 0;
 };
diff --git a/sycl/test/native_cpu/local_basic.cpp b/sycl/test/native_cpu/local_basic.cpp
@@ -0,0 +1,43 @@
+// REQUIRES: native_cpu_be
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+class Test;
+constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
+// Launches NumG work-groups of `localSize` work-items. Every work-item stores
+// its global id in two local accessors and writes their sum to the buffer;
+// the host then checks that element i holds 2 * i. Returns 0 on success and 1
+// on the first mismatch.
+int test(queue q, const unsigned localSize) {
+  constexpr unsigned NumG = 2;
+  const unsigned totalItems = NumG * localSize;
+  range<1> groupRange{localSize};
+  range<1> fullRange{totalItems};
+  buffer<int, 1> Buffer(fullRange);
+  q.submit([&](handler &cgh) {
+    auto out = Buffer.get_access<sycl_write>(cgh);
+    local_accessor<int, 1> scratchA(groupRange, cgh);
+    local_accessor<int, 1> scratchB(groupRange, cgh);
+    cgh.parallel_for<Test>(
+        nd_range<1>{fullRange, groupRange}, [=](nd_item<1> item) {
+          auto localId = item.get_local_id(0);
+          auto globalId = item.get_global_id(0);
+          scratchA[localId] = globalId;
+          scratchB[localId] = globalId;
+          out[globalId] = scratchA[localId] + scratchB[localId];
+        });
+  });
+  sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
+  for (unsigned idx = 0; idx != totalItems; ++idx) {
+    if (HostAccessor[idx] != 2 * idx) {
+      std::cout << "Error\n";
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Runs the check with two different work-group sizes (and therefore two
+// different local accessor sizes); exits non-zero if either run failed.
+int main() {
+  queue q;
+  const int smallRun = test(q, 10);
+  const int largeRun = test(q, 20);
+  return smallRun || largeRun;
+}