Skip to content

[OpenMP] Support 'omp_get_num_procs' on the device #65501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions openmp/libomptarget/DeviceRTL/include/Configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ void *getIndirectCallTablePtr();
/// Returns the size of the indirect call table.
uint64_t getIndirectCallTableSize();

/// Returns the size of the indirect call table.
uint64_t getHardwareParallelism();

/// Return if debugging is enabled for the given debug kind.
bool isDebugMode(DebugKind Level);

Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ void *config::getIndirectCallTablePtr() {
__omp_rtl_device_environment.IndirectCallTable);
}

uint64_t config::getHardwareParallelism() {
return __omp_rtl_device_environment.HardwareParallelism;
}

uint64_t config::getIndirectCallTableSize() {
return __omp_rtl_device_environment.IndirectCallTableSize;
}
Expand Down
4 changes: 3 additions & 1 deletion openmp/libomptarget/DeviceRTL/src/Mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,9 @@ uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
return NumberOfBlocks;
}

uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
uint32_t mapping::getNumberOfProcessorElements() {
return static_cast<uint32_t>(config::getHardwareParallelism());
}

///}

Expand Down
1 change: 1 addition & 0 deletions openmp/libomptarget/include/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct DeviceEnvironmentTy {
uint64_t ClockFrequency;
uintptr_t IndirectCallTable;
uint64_t IndirectCallTableSize;
uint64_t HardwareParallelism;
};

// NOTE: Please don't change the order of those members as their indices are
Expand Down
13 changes: 9 additions & 4 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1942,16 +1942,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// AMDGPU devices do not have the concept of contexts.
Error setContext() override { return Plugin::success(); }

/// AMDGPU returns the product of the number of compute units and the waves
/// per compute unit.
uint64_t getHardwareParallelism() const override {
return HardwareParallelism;
}

/// We want to set up the RPC server for host services to the GPU if it is
/// availible.
bool shouldSetupRPCServer() const override {
return libomptargetSupportsRPC();
}

/// AMDGPU returns the product of the number of compute units and the waves
/// per compute unit.
uint64_t requestedRPCPortCount() const override {
return HardwareParallelism;
/// The RPC interface should have enough space for all availible parallelism.
uint64_t requestedRPCPortCount() const override {
return getHardwareParallelism();
}

/// Get the stream of the asynchronous info sructure or get a new one.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
DeviceEnvironment.IndirectCallTable =
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
DeviceEnvironment.HardwareParallelism = getHardwareParallelism();

// Create the metainfo of the device environment global.
GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return OMPX_MinThreadsForLowTripCount;
}

/// Get the total amount of hardware parallelism supported by the target
/// device. This is the total amount of warps or wavefronts that can be
/// resident on the device simultaneously.
virtual uint64_t getHardwareParallelism() const { return 0; }

/// Get the RPC server running on this device.
RPCServerTy *getRPCServer() const { return RPCServer; }

Expand Down
16 changes: 11 additions & 5 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
NumMuliprocessors))
return Err;
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
MaxThreadsPerSM))
if (auto Err =
getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
MaxThreadsPerSM))
return Err;
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
return Err;
Expand Down Expand Up @@ -373,16 +374,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
}

/// NVIDIA returns the product of the SM count and the number of warps that
/// fit if the maximum number of threads were scheduled on each SM.
uint64_t getHardwareParallelism() const override {
return HardwareParallelism;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is this value set?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is borrowing it from a previous patch that was added for the RPC support. It's currently set at line 309.

}

/// We want to set up the RPC server for host services to the GPU if it is
/// availible.
bool shouldSetupRPCServer() const override {
return libomptargetSupportsRPC();
}

/// NVIDIA returns the product of the SM count and the number of warps that
/// fit if the maximum number of threads were scheduled on each SM.
/// The RPC interface should have enough space for all availible parallelism.
uint64_t requestedRPCPortCount() const override {
return HardwareParallelism;
return getHardwareParallelism();
}

/// Get the stream of the asynchronous info sructure or get a new one.
Expand Down
15 changes: 15 additions & 0 deletions openmp/libomptarget/test/api/omp_get_num_procs.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %libomptarget-compile-run-and-check-generic
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to require certain targets otherwise the test will fail since by default it returns 0.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For x86 offloading this should use libomp.so's implementation, which should be supported.


#include <stdio.h>

int omp_get_num_procs();

int main() {
int num_procs;
#pragma omp target map(from : num_procs)
{ num_procs = omp_get_num_procs(); }

// CHECK: PASS
if (num_procs > 0)
printf("PASS\n");
}