[OpenMP] New OpenMP device RTL functions #73225

Merged
30 changes: 30 additions & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
Member commented:
You need to add the SignExt attributes to the Int32 arguments.
Look for __OMP_RTL_ATTRS(__kmpc_cancel, below to see how.
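For reference, a minimal sketch of the pattern being asked for, modeled on the existing __kmpc_cancel entry that appears in the hunk below (the attribute entries actually added for the new _4/_4u loop functions are also part of this diff):

// An __OMP_RTL_ATTRS entry lists the function attributes, the return
// attribute, and then one attribute slot per parameter in declaration order.
// Signed Int32 parameters take SExt and unsigned ones take ZExt; the 64-bit
// variants presumably need no extension attribute, which is why only the
// _4/_4u functions get entries in this patch.
__OMP_RTL_ATTRS(__kmpc_cancel, InaccessibleArgOnlyAttrs, SExt,
                ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))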

__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -650,6 +662,24 @@ __OMP_RTL_ATTRS(__kmpc_cancel, InaccessibleArgOnlyAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
SExt, SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SExt))
__OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
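To make the VoidPtr/Int32 signatures above concrete, here is a hedged sketch of the C prototypes the new __OMP_RTL entries correspond to for the _4 (int32_t) variants. The parameter names follow the device-side definitions in the Workshare.cpp diff below; note the .def file declares the outlined-body callback as a plain VoidPtr, while the device RTL takes a typed function pointer.

extern "C" {
// Worksharing for-loop: fn is the outlined loop body, called with the
// normalized induction variable and the captured-arguments pointer.
void __kmpc_for_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                              void *arg, int32_t num_iters,
                              int32_t num_threads, int32_t thread_chunk);

// distribute-loop spread across the teams/blocks of the kernel.
void __kmpc_distribute_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                                     void *arg, int32_t num_iters,
                                     int32_t block_chunk);

// Combined distribute parallel for across blocks and threads.
void __kmpc_distribute_for_static_loop_4(IdentTy *loc,
                                         void (*fn)(int32_t, void *),
                                         void *arg, int32_t num_iters,
                                         int32_t num_threads,
                                         int32_t block_chunk,
                                         int32_t thread_chunk);
}

The _4u/_8/_8u variants differ only in the index type (uint32_t, int64_t, uint64_t).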
254 changes: 254 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {

#pragma omp begin declare target device_type(nohost)

extern int32_t __omp_rtl_assume_teams_oversubscription;
extern int32_t __omp_rtl_assume_threads_oversubscription;

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

@@ -636,4 +639,255 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template argument
/// throughout.
template <typename Ty> class StaticLoopChunker {

/// Generic loop nest that handles block and/or thread distribution in the
/// absence of user-specified chunk sizes. This implicitly picks a block chunk
/// size equal to the number of threads in the block and a thread chunk size
/// equal to one. In contrast to the chunked version, we can get away with a
/// single loop in this case.
static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
Ty NumBlocks, Ty BId, Ty NumThreads,
Ty TId, Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * NumThreads;

// Start index in the normalized space.
Ty IV = BId * NumThreads + TId;
ASSERT(IV >= 0, "Bad index");

// Cover the entire iteration space; assumptions in the caller might allow
// us to simplify this loop to a conditional.
if (IV < NumIters) {
do {

// Execute the loop body.
LoopBody(IV, Arg);

// Every thread executed one block and thread chunk now.
IV += KernelIteration;

if (OneIterationPerThread)
return;

} while (IV < NumIters);
}
}

/// Generic loop nest that handles block and/or thread distribution in the
/// presence of user-specified chunk sizes (for at least one of them).
static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
Ty BlockChunk, Ty NumBlocks, Ty BId,
Ty ThreadChunk, Ty NumThreads, Ty TId,
Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;

// Start index in the chunked space.
Ty IV = BId * BlockChunk + TId;
ASSERT(IV >= 0, "Bad index");

// Cover the entire iteration space; assumptions in the caller might allow
// us to simplify this loop to a conditional.
do {

Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
Ty ThreadChunkLeft =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

while (ThreadChunkLeft--) {

// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;

// Execute the loop body.
LoopBody(IV, Arg);

if (OneIterationPerThread)
return;

++IV;
}

IV += KernelIteration;

} while (IV < NumIters);
}

public:
/// Worksharing `for`-loop.
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");

// All threads need to participate but we don't know if we are in a
// parallel at all or if the user might have used a `num_threads` clause
// on the parallel and reduced the number compared to the block size.
// Since nested parallels are possible too we need to get the thread id
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();

// There are no blocks involved here.
Ty BlockChunk = 0;
Ty NumBlocks = 1;
Ty BId = 0;

// If the thread chunk is not specified we pick a default now.
if (ThreadChunk == 0)
ThreadChunk = 1;

// If we know we have more threads than iterations we can indicate that to
// avoid an outer loop.
bool OneIterationPerThread = false;
if (__omp_rtl_assume_threads_oversubscription) {
ASSERT(NumThreads >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}

if (ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);
}

/// Worksharing `distribute`-loop.
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty BlockChunk) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(BlockChunk >= 0, "Bad block count");

// There are no threads involved here.
Ty ThreadChunk = 0;
Ty NumThreads = 1;
Ty TId = 0;
ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");

// All teams need to participate.
Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
Ty BId = mapping::getBlockIdInKernel();

// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
BlockChunk = NumThreads;

// If we know we have more blocks than iterations we can indicate that to
// avoid an outer loop.
bool OneIterationPerThread = false;
if (__omp_rtl_assume_teams_oversubscription) {
ASSERT(NumBlocks >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}

if (BlockChunk != NumThreads)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);

ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
}

/// Worksharing `distribute parallel for`-loop.
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
Ty BlockChunk, Ty ThreadChunk) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(BlockChunk >= 0, "Bad block count");
ASSERT(ThreadChunk >= 0, "Bad thread count");

// All threads need to participate but the user might have used a
// `num_threads` clause on the parallel and reduced the number compared to
// the block size.
Ty TId = mapping::getThreadIdInBlock();

// All teams need to participate.
Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
Ty BId = mapping::getBlockIdInKernel();

// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
BlockChunk = NumThreads;

// If the thread chunk is not specified we pick a default now.
if (ThreadChunk == 0)
ThreadChunk = 1;

// If we know we have more threads (across all blocks) than iterations we
// can indicate that to avoid an outer loop.
bool OneIterationPerThread = false;
if (__omp_rtl_assume_teams_oversubscription &
__omp_rtl_assume_threads_oversubscription) {
OneIterationPerThread = true;
ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
}

if (BlockChunk != NumThreads || ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);

ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
}
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY) \
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY block_chunk, TY thread_chunk) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
} \
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
void *arg, TY num_iters, \
TY block_chunk) { \
ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1, \
block_chunk); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY thread_chunk) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
thread_chunk); \
}

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}

#pragma omp end declare target
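
As a closing illustration, the following standalone host-only sketch (not part of the patch) reproduces the index math of NormalizedLoopNestNoChunk from the diff above: each (block, thread) pair starts at IV = BId * NumThreads + TId and strides by NumBlocks * NumThreads, i.e. iterations are dealt out cyclically with an implicit thread chunk of one.

#include <cstdint>
#include <cstdio>

// Hypothetical host-side helper: prints which normalized iterations each
// simulated (block, thread) pair would execute under the no-chunk mapping.
static void printNoChunkMapping(int32_t NumBlocks, int32_t NumThreads,
                                int32_t NumIters) {
  int32_t KernelIteration = NumBlocks * NumThreads;
  for (int32_t BId = 0; BId < NumBlocks; ++BId)
    for (int32_t TId = 0; TId < NumThreads; ++TId)
      for (int32_t IV = BId * NumThreads + TId; IV < NumIters;
           IV += KernelIteration)
        std::printf("block %d, thread %d -> IV %d\n", BId, TId, IV);
}

int main() {
  // With 2 blocks of 4 threads and 10 iterations, iterations 8 and 9 wrap
  // around to block 0, threads 0 and 1, which is the second trip of the
  // do/while loop in NormalizedLoopNestNoChunk.
  printNoChunkMapping(/*NumBlocks=*/2, /*NumThreads=*/4, /*NumIters=*/10);
  return 0;
}

The chunked nest in the patch generalizes this mapping by letting the BlockChunk and ThreadChunk arguments override the implicit chunk sizes.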