
Commit d4d88b8

[OpenMP] New Openmp device RTL functions (#73225)
Add new implementation of workshare loop functions. These functions will be used by OpenMPIRBuilder to support handling of OpenMP workshare loops for the target region.

Co-authored-by: Johannes Doerfert <[email protected]>
1 parent 3930a0b commit d4d88b8
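
For orientation before the diffs, here is a minimal sketch of how compiler-generated code might drive one of the new entry points. The payload struct, the outlined body, and the trip-count value are illustrative assumptions; only the extern "C" prototype mirrors the signature added in this patch.

    // Hypothetical illustration only: the payload struct and outlined body are
    // invented; the prototype mirrors __kmpc_for_static_loop_4 as added below.
    #include <cstdint>

    struct IdentTy; // opaque source-location descriptor used by the OpenMP runtime

    extern "C" void __kmpc_for_static_loop_4(IdentTy *loc,
                                             void (*fn)(int32_t, void *),
                                             void *arg, int32_t num_iters,
                                             int32_t num_threads,
                                             int32_t thread_chunk);

    // Outlined loop body: receives the normalized induction variable plus a
    // pointer to the captured variables.
    struct LoopPayload {
      float *A;
      const float *B;
    };

    static void LoopBodyFn(int32_t IV, void *Arg) {
      auto *P = static_cast<LoopPayload *>(Arg);
      P->A[IV] += P->B[IV];
    }

    // Roughly what OpenMPIRBuilder-emitted code could look like after outlining.
    // The exact trip-count encoding is whatever the IR builder emits (note that
    // the runtime entry forwards num_iters + 1 to the chunker), so NumIters here
    // is an assumption, not a statement about the ABI. thread_chunk == 0 lets
    // the runtime pick its default of 1.
    void runWorkshareLoop(IdentTy *Loc, float *A, const float *B,
                          int32_t NumIters, int32_t NumThreads) {
      LoopPayload Payload{A, B};
      __kmpc_for_static_loop_4(Loc, LoopBodyFn, &Payload, NumIters, NumThreads,
                               /*thread_chunk=*/0);
    }

On the IR side the body and payload arguments are plain void pointers, which is exactly how the OMPKinds.def entries below declare them (VoidPtr, VoidPtr).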

File tree

2 files changed: 284 additions, 0 deletions

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Lines changed: 30 additions & 0 deletions
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -650,6 +662,24 @@ __OMP_RTL_ATTRS(__kmpc_cancel, InaccessibleArgOnlyAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
 __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt, ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
                 ParamAttrs(AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
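
The .def entries above feed the usual LLVM X-macro pattern: a client defines the __OMP_RTL/OMP_RTL macro family before including OMPKinds.def and gets one expansion per runtime function. The consumer below is a simplified, hypothetical stand-in for what OMPIRBuilder does; it only collects names, and two of the new entries are written out inline instead of including the real file.

    // Simplified, hypothetical consumer of the registry pattern used by
    // OMPKinds.def; real clients build enums and llvm::FunctionType tables.
    #include <cstdio>

    // Collect just the function names. The type tokens (Void, IdentPtr, ...) are
    // swallowed by the variadic parameter and never expanded here.
    #define __OMP_RTL(Name, IsVarArg, ReturnType, ...) #Name,

    static const char *NewDeviceRTLEntries[] = {
        // Normally: #include "llvm/Frontend/OpenMP/OMPKinds.def"
        __OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr,
                  VoidPtr, Int32, Int32, Int32)
        __OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr,
                  VoidPtr, Int32, Int32)
    };

    #undef __OMP_RTL

    int main() {
      for (const char *Name : NewDeviceRTLEntries)
        std::printf("device RTL entry: %s\n", Name);
      return 0;
    }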

openmp/libomptarget/DeviceRTL/src/Workshare.cpp

Lines changed: 254 additions & 0 deletions
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
 
 #pragma omp begin declare target device_type(nohost)
 
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);
 
@@ -636,4 +639,255 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
 }
 
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user-specified chunk sizes. This implicitly picks a block chunk
+  /// size equal to the number of threads in the block and a thread chunk size
+  /// equal to one. In contrast to the chunked version we can get away with a
+  /// single loop in this case.
+  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty NumBlocks, Ty BId, Ty NumThreads,
+                                        Ty TId, Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * NumThreads;
+
+    // Start index in the normalized space.
+    Ty IV = BId * NumThreads + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    if (IV < NumIters) {
+      do {
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        // Every thread executed one block and thread chunk now.
+        IV += KernelIteration;
+
+        if (OneIterationPerThread)
+          return;
+
+      } while (IV < NumIters);
+    }
+  }
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user-specified chunk sizes (for at least one of them).
+  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
+                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
+                                        Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * BlockChunk;
+
+    // Start index in the chunked space.
+    Ty IV = BId * BlockChunk + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    do {
+
+      Ty BlockChunkLeft =
+          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+      Ty ThreadChunkLeft =
+          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+      while (ThreadChunkLeft--) {
+
+        // Given the blocking it's hard to keep track of what to execute.
+        if (IV >= NumIters)
+          return;
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        if (OneIterationPerThread)
+          return;
+
+        ++IV;
+      }
+
+      IV += KernelIteration;
+
+    } while (IV < NumIters);
+  }
+
+public:
+  /// Worksharing `for`-loop.
+  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but we don't know if we are in a
+    // parallel at all or if the user might have used a `num_threads` clause
+    // on the parallel and reduced the number compared to the block size.
+    // Since nested parallels are possible too we need to get the thread id
+    // from the `omp` getter and not the mapping directly.
+    Ty TId = omp_get_thread_num();
+
+    // There are no blocks involved here.
+    Ty BlockChunk = 0;
+    Ty NumBlocks = 1;
+    Ty BId = 0;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_threads_oversubscription) {
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+  }
+
+  /// Worksharing `distribute`-loop.
+  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                         Ty NumIters, Ty BlockChunk) {
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+
+    // There are no threads involved here.
+    Ty ThreadChunk = 0;
+    Ty NumThreads = 1;
+    Ty TId = 0;
+    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription) {
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (BlockChunk != NumThreads)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+  }
+
+  /// Worksharing `distribute parallel for`-loop.
+  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+                            void *Arg, Ty NumIters, Ty NumThreads,
+                            Ty BlockChunk, Ty ThreadChunk) {
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number compared to
+    // the block size.
+    Ty TId = mapping::getThreadIdInBlock();
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations we
+    // can indicate that to avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription &
+        __omp_rtl_assume_threads_oversubscription) {
+      OneIterationPerThread = true;
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+    }
+
+    if (BlockChunk != NumThreads || ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+  }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY)                                                 \
+  [[gnu::flatten, clang::always_inline]] void                                  \
+      __kmpc_distribute_for_static_loop##BW(                                   \
+          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
+          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
+    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
+        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
+  }                                                                            \
+  [[gnu::flatten, clang::always_inline]] void                                  \
+      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
+                                        void *arg, TY num_iters,               \
+                                        TY block_chunk) {                      \
+    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
+                                            block_chunk);                      \
+  }                                                                            \
+  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY thread_chunk) {                                       \
+    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+                                     thread_chunk);                            \
+  }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
 #pragma omp end declare target
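
To make the block/thread-cyclic distribution in NormalizedLoopNestNoChunk concrete, here is a host-side sketch that replays that loop nest for a small simulated grid and checks that every normalized iteration is executed exactly once. It is illustrative only: the 4x8 grid and 100 iterations are arbitrary assumptions, and the ASSERT and OneIterationPerThread machinery is stripped.

    // Host-side illustration (not part of the patch): replay the no-chunk loop
    // nest from StaticLoopChunker for a small simulated grid and verify that the
    // block/thread-cyclic distribution covers each iteration exactly once.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using Ty = int32_t;

    // Mirrors NormalizedLoopNestNoChunk above, minus ASSERT and the
    // OneIterationPerThread shortcut.
    static void NoChunkNest(void (*LoopBody)(Ty, void *), void *Arg, Ty NumBlocks,
                            Ty BId, Ty NumThreads, Ty TId, Ty NumIters) {
      Ty KernelIteration = NumBlocks * NumThreads;
      Ty IV = BId * NumThreads + TId; // start index in the normalized space
      if (IV < NumIters) {
        do {
          LoopBody(IV, Arg);
          IV += KernelIteration; // every thread advanced one block+thread chunk
        } while (IV < NumIters);
      }
    }

    int main() {
      const Ty NumBlocks = 4, NumThreads = 8, NumIters = 100;
      std::vector<int> Hits(NumIters, 0);

      // Sequentially simulate every (block, thread) pair of the grid.
      for (Ty BId = 0; BId < NumBlocks; ++BId)
        for (Ty TId = 0; TId < NumThreads; ++TId)
          NoChunkNest(
              [](Ty IV, void *Arg) {
                ++(*static_cast<std::vector<int> *>(Arg))[IV];
              },
              &Hits, NumBlocks, BId, NumThreads, TId, NumIters);

      int Errors = 0;
      for (Ty IV = 0; IV < NumIters; ++IV)
        Errors += (Hits[IV] != 1);
      std::printf("%d of %d iterations covered exactly once\n",
                  NumIters - Errors, NumIters);
      return Errors != 0;
    }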
