Skip to content

Commit 460840c

Browse files
authored
[OpenMP] Support 'omp_get_num_procs' on the device (#65501)
Summary: The `omp_get_num_procs()` function should return the amount of parallelism availible. On the GPU, this was not defined. We have elected to define this function as the maximum amount of wavefronts / warps that can be simultaneously resident on the device. For AMDGPU this is the number of CUs multiplied byth CU's per wave. For NVPTX this is the maximum threads per SM divided by the warp size and multiplied by the number of SMs.
1 parent 6f38713 commit 460840c

File tree

9 files changed

+52
-10
lines changed

9 files changed

+52
-10
lines changed

openmp/libomptarget/DeviceRTL/include/Configuration.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ void *getIndirectCallTablePtr();
4646
/// Returns the size of the indirect call table.
4747
uint64_t getIndirectCallTableSize();
4848

49+
/// Returns the size of the indirect call table.
50+
uint64_t getHardwareParallelism();
51+
4952
/// Return if debugging is enabled for the given debug kind.
5053
bool isDebugMode(DebugKind Level);
5154

openmp/libomptarget/DeviceRTL/src/Configuration.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ void *config::getIndirectCallTablePtr() {
5555
__omp_rtl_device_environment.IndirectCallTable);
5656
}
5757

58+
uint64_t config::getHardwareParallelism() {
59+
return __omp_rtl_device_environment.HardwareParallelism;
60+
}
61+
5862
uint64_t config::getIndirectCallTableSize() {
5963
return __omp_rtl_device_environment.IndirectCallTableSize;
6064
}

openmp/libomptarget/DeviceRTL/src/Mapping.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,9 @@ uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
333333
return NumberOfBlocks;
334334
}
335335

336-
uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
336+
uint32_t mapping::getNumberOfProcessorElements() {
337+
return static_cast<uint32_t>(config::getHardwareParallelism());
338+
}
337339

338340
///}
339341

openmp/libomptarget/include/Environment.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ struct DeviceEnvironmentTy {
3333
uint64_t ClockFrequency;
3434
uintptr_t IndirectCallTable;
3535
uint64_t IndirectCallTableSize;
36+
uint64_t HardwareParallelism;
3637
};
3738

3839
// NOTE: Please don't change the order of those members as their indices are

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1942,16 +1942,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
19421942
/// AMDGPU devices do not have the concept of contexts.
19431943
Error setContext() override { return Plugin::success(); }
19441944

1945+
/// AMDGPU returns the product of the number of compute units and the waves
1946+
/// per compute unit.
1947+
uint64_t getHardwareParallelism() const override {
1948+
return HardwareParallelism;
1949+
}
1950+
19451951
/// We want to set up the RPC server for host services to the GPU if it is
19461952
/// availible.
19471953
bool shouldSetupRPCServer() const override {
19481954
return libomptargetSupportsRPC();
19491955
}
19501956

1951-
/// AMDGPU returns the product of the number of compute units and the waves
1952-
/// per compute unit.
1953-
uint64_t requestedRPCPortCount() const override {
1954-
return HardwareParallelism;
1957+
/// The RPC interface should have enough space for all availible parallelism.
1958+
uint64_t requestedRPCPortCount() const override {
1959+
return getHardwareParallelism();
19551960
}
19561961

19571962
/// Get the stream of the asynchronous info sructure or get a new one.

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
692692
DeviceEnvironment.IndirectCallTable =
693693
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
694694
DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
695+
DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
695696

696697
// Create the metainfo of the device environment global.
697698
GlobalTy DevEnvGlobal("__omp_rtl_device_environment",

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
781781
return OMPX_MinThreadsForLowTripCount;
782782
}
783783

784+
/// Get the total amount of hardware parallelism supported by the target
785+
/// device. This is the total amount of warps or wavefronts that can be
786+
/// resident on the device simultaneously.
787+
virtual uint64_t getHardwareParallelism() const { return 0; }
788+
784789
/// Get the RPC server running on this device.
785790
RPCServerTy *getRPCServer() const { return RPCServer; }
786791

openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
301301
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
302302
NumMuliprocessors))
303303
return Err;
304-
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
305-
MaxThreadsPerSM))
304+
if (auto Err =
305+
getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
306+
MaxThreadsPerSM))
306307
return Err;
307308
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
308309
return Err;
@@ -373,16 +374,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
373374
return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
374375
}
375376

377+
/// NVIDIA returns the product of the SM count and the number of warps that
378+
/// fit if the maximum number of threads were scheduled on each SM.
379+
uint64_t getHardwareParallelism() const override {
380+
return HardwareParallelism;
381+
}
382+
376383
/// We want to set up the RPC server for host services to the GPU if it is
377384
/// availible.
378385
bool shouldSetupRPCServer() const override {
379386
return libomptargetSupportsRPC();
380387
}
381388

382-
/// NVIDIA returns the product of the SM count and the number of warps that
383-
/// fit if the maximum number of threads were scheduled on each SM.
389+
/// The RPC interface should have enough space for all availible parallelism.
384390
uint64_t requestedRPCPortCount() const override {
385-
return HardwareParallelism;
391+
return getHardwareParallelism();
386392
}
387393

388394
/// Get the stream of the asynchronous info sructure or get a new one.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// RUN: %libomptarget-compile-run-and-check-generic
2+
3+
#include <stdio.h>
4+
5+
int omp_get_num_procs();
6+
7+
int main() {
8+
int num_procs;
9+
#pragma omp target map(from : num_procs)
10+
{ num_procs = omp_get_num_procs(); }
11+
12+
// CHECK: PASS
13+
if (num_procs > 0)
14+
printf("PASS\n");
15+
}

0 commit comments

Comments
 (0)