Threadpool implementation

PietroGhg · PietroGhg · commit 06c05d83f600 · 2024-04-11T09:35:49.000+01:00
diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp
@@ -98,7 +98,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
     return ReturnValue(bool{false});
   case UR_DEVICE_INFO_MAX_COMPUTE_UNITS:
-    return ReturnValue(uint32_t{256});
+    return ReturnValue(static_cast<uint32_t>(
+          hDevice->tp.num_threads()));
   case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES:
     return ReturnValue(uint32_t{0});
   case UR_DEVICE_INFO_SUPPORTED_PARTITIONS:
diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp
@@ -11,9 +11,17 @@
 #pragma once
 
 #include <ur/ur.hpp>
+#include "threadpool.hpp"
 
 struct ur_device_handle_t_ {
-  ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {}
+  native_cpu::threadpool_t tp;
+  ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {
+    tp.start();
+  }
+
+  ~ur_device_handle_t_() {
+    tp.stop();
+  }
 
   ur_platform_handle_t Platform;
 };
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
@@ -1,9 +1,7 @@
 //===----------- enqueue.cpp - NATIVE CPU Adapter -------------------------===//
 //
-// Copyright (C) 2023 Intel Corporation
-//
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
-// Exceptions. See LICENSE.TXT
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -15,6 +13,8 @@
 #include "common.hpp"
 #include "kernel.hpp"
 #include "memory.hpp"
+#include "threadpool.hpp"
+#include "queue.hpp"
 
 namespace native_cpu {
 struct NDRDescT {
@@ -61,14 +61,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   // TODO: add proper error checking
   // TODO: add proper event dep management
-  native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
-                           pLocalWorkSize);
-  hKernel->handleLocalArgs();
+  native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
+  auto& tp = hQueue->device->tp;
+  const size_t numParallelThreads = tp.num_threads();
+  hKernel->updateMemPool(numParallelThreads);
+  std::vector<std::future<void>> futures;
+  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
+  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
+  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+  bool isLocalSizeOne =
+      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  
 
   native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+  if (isLocalSizeOne) {
+    // If the local size is one, we assume that we are running a parallel_for
+    // over a sycl::range. TODO: we could add compiler checks and kernel
+    // properties for this (e.g. check that no barriers are called and that
+    // there are no local memory args).
 
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
@@ -92,6 +105,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       }
     }
   }
+
+  for (auto &f : futures)
+    f.get();
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
@@ -537,3 +553,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
 
   DIE_NO_IMPLEMENTATION;
 }
+
diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp
@@ -1,9 +1,7 @@
 //===--------------- kernel.hpp - Native CPU Adapter ----------------------===//
 //
-// Copyright (C) 2023 Intel Corporation
-//
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
-// Exceptions. See LICENSE.TXT
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -42,50 +40,53 @@ struct ur_kernel_handle_t_ : RefCounted {
   ur_kernel_handle_t_(const char *name, nativecpu_task_t subhandler)
       : _name{name}, _subhandler{std::move(subhandler)} {}
 
-  const char *_name;
-  nativecpu_task_t _subhandler;
-  std::vector<native_cpu::NativeCPUArgDesc> _args;
-  std::vector<local_arg_info_t> _localArgInfo;
-
-  // To be called before enqueing the kernel.
-  void handleLocalArgs() {
-    updateMemPool();
-    size_t offset = 0;
-    for (auto &entry : _localArgInfo) {
-      _args[entry.argIndex].MPtr =
-          reinterpret_cast<char *>(_localMemPool) + offset;
-      // update offset in the memory pool
-      // Todo: update this offset computation when we have work-group
-      // level parallelism.
-      offset += entry.argSize;
-    }
+  ur_kernel_handle_t_(const ur_kernel_handle_t_& other) : _name(other._name), _subhandler(other._subhandler), 
+  _args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), _localMemPoolSize(other._localMemPoolSize) {
+    incrementReferenceCount();
   }
 
   ~ur_kernel_handle_t_() {
-    if (_localMemPool) {
+    decrementReferenceCount();
+    if (_refCount == 0) {
       free(_localMemPool);
     }
+  
   }
 
-private:
-  void updateMemPool() {
+  const char *_name;
+  nativecpu_task_t _subhandler;
+  std::vector<native_cpu::NativeCPUArgDesc> _args;
+  std::vector<local_arg_info_t> _localArgInfo;
+
+  // To be called before enqueuing the kernel.
+  void updateMemPool(size_t numParallelThreads) {
     // compute requested size.
-    // Todo: currently we execute only one work-group at a time, so for each
-    // local arg we can allocate just 1 * argSize local arg. When we implement
-    // work-group level parallelism we should allocate N * argSize where N is
-    // the number of work groups being executed in parallel (e.g. number of
-    // threads in the thread pool).
     size_t reqSize = 0;
     for (auto &entry : _localArgInfo) {
-      reqSize += entry.argSize;
+      reqSize += entry.argSize * numParallelThreads;
     }
     if (reqSize == 0 || reqSize == _localMemPoolSize) {
       return;
     }
     // realloc handles nullptr case
-    _localMemPool = realloc(_localMemPool, reqSize);
+    _localMemPool = (char*)realloc(_localMemPool, reqSize);
     _localMemPoolSize = reqSize;
   }
-  void *_localMemPool = nullptr;
+
+  // To be called before executing a work group
+  void handleLocalArgs(size_t numParallelThread, size_t threadId) {
+    // Each local argument owns argSize * numParallelThread bytes of the pool.
+    size_t offset = 0;
+    for (auto &entry : _localArgInfo) {
+      _args[entry.argIndex].MPtr =
+          _localMemPool + offset + (entry.argSize * threadId);
+      // update offset in the memory pool
+      offset += entry.argSize * numParallelThread;
+    }
+  }
+
+private:
+  char* _localMemPool = nullptr;
   size_t _localMemPoolSize = 0;
 };
+
diff --git a/source/adapters/native_cpu/queue.cpp b/source/adapters/native_cpu/queue.cpp
@@ -35,10 +35,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
   std::ignore = hDevice;
   std::ignore = pProperties;
 
-  auto Queue = new ur_queue_handle_t_();
+  auto Queue = new ur_queue_handle_t_(hDevice);
   *phQueue = Queue;
 
-  CONTINUE_NO_IMPLEMENTATION;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {
diff --git a/source/adapters/native_cpu/queue.hpp b/source/adapters/native_cpu/queue.hpp
@@ -9,5 +9,11 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 #include "common.hpp"
+#include "device.hpp"
 
-struct ur_queue_handle_t_ : RefCounted {};
+struct ur_queue_handle_t_ : RefCounted {
+  ur_device_handle_t_ *device;
+
+  ur_queue_handle_t_(ur_device_handle_t_ *device) : device(device) {}
+
+};
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp