-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[flang][cuda] Sync double descriptor after c_f_pointer call #130194
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: After a global device pointer is set through `c_f_pointer`, we need to sync the double descriptor so the version on the device is also up to date. Full diff: https://github.com/llvm/llvm-project/pull/130194.diff 12 Files Affected:
diff --git a/flang/include/flang/Lower/Cuda.h b/flang/include/flang/Lower/Cuda.h
index d97045383d195..b6f849e3d63f0 100644
--- a/flang/include/flang/Lower/Cuda.h
+++ b/flang/include/flang/Lower/Cuda.h
@@ -20,27 +20,6 @@
#include "mlir/Dialect/OpenACC/OpenACC.h"
namespace Fortran::lower {
-// Check if the insertion point is currently in a device context. HostDevice
-// subprogram are not considered fully device context so it will return false
-// for it.
-// If the insertion point is inside an OpenACC region op, it is considered
-// device context.
-static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) {
- if (builder.getRegion().getParentOfType<cuf::KernelOp>())
- return true;
- if (builder.getRegion()
- .getParentOfType<mlir::acc::ComputeRegionOpInterface>())
- return true;
- if (auto funcOp = builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr =
- funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
- cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
- }
- }
- return false;
-}
static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
std::optional<Fortran::common::CUDADataAttr> cudaAttr =
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index b99e330429622..e3c7b5098b83f 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -25,8 +25,10 @@ namespace cuf {
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
-bool isInCUDADeviceContext(mlir::Operation *op);
+bool isCUDADeviceContext(mlir::Operation *op);
+bool isCUDADeviceContext(mlir::Region &);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
+bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
new file mode 100644
index 0000000000000..14d262bf22a70
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -0,0 +1,31 @@
+//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+}
+
+namespace fir::runtime::cuda {
+
+/// Generate runtime call to sync the double descriptor referenced by
+/// \p hostPtr.
+void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value hostPtr);
+
+} // namespace fir::runtime::cuda
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 3d21e7a3fa8d5..9938bd573d1fa 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -470,7 +470,7 @@ class AllocateStmtHelper {
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
- bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() &&
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
@@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder,
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
- bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineDeallocation =
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 95f431983d442..e368974c92a3e 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4689,7 +4689,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index f0563d092e3dc..31ae395805faf 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -18,6 +18,7 @@ add_flang_library(FIRBuilder
Runtime/Assign.cpp
Runtime/Character.cpp
Runtime/Command.cpp
+ Runtime/CUDA/Descriptor.cpp
Runtime/Derived.cpp
Runtime/EnvironmentDefaults.cpp
Runtime/Exceptions.cpp
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index 39848205f47af..5f286c04a7ca0 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -12,6 +12,7 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
/// Retrieve or create the CUDA Fortran GPU module in the give in \p mod.
mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
@@ -31,25 +32,34 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
return gpuMod;
}
-bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
- if (!op)
+bool cuf::isCUDADeviceContext(mlir::Operation *op) {
+ if (!op || !op->getParentRegion())
return false;
- if (op->getParentOfType<cuf::KernelOp>() ||
- op->getParentOfType<mlir::gpu::GPUFuncOp>())
+ return isCUDADeviceContext(*op->getParentRegion());
+}
+
+// Check if the insertion point is currently in a device context. HostDevice
+// subprograms are not considered a full device context, so this returns
+// false for them.
+// If the insertion point is inside an OpenACC region op, it is considered
+// a device context.
+bool cuf::isCUDADeviceContext(mlir::Region &region) {
+ if (region.getParentOfType<cuf::KernelOp>())
+ return true;
+ if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
return true;
- if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
+ if (auto funcOp = region.getParentOfType<mlir::func::FuncOp>()) {
+ if (auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName())) {
+ return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
+ cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
return false;
}
-bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
- if (op.getConstant())
- return false;
- auto attr = op.getDataAttr();
+bool cuf::isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr) {
if (attr && (*attr == cuf::DataAttribute::Device ||
*attr == cuf::DataAttribute::Managed ||
*attr == cuf::DataAttribute::Constant))
@@ -57,6 +67,12 @@ bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
return false;
}
+bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
+ if (op.getConstant())
+ return false;
+ return isRegisteredDeviceAttr(op.getDataAttr());
+}
+
void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index c52b0cbaf2019..8370e82c10b67 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -16,12 +16,14 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Common/static-multimap-view.h"
#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Command.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
@@ -38,6 +40,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/Utils.h"
#include "flang/Runtime/entry-names.h"
@@ -3254,6 +3257,16 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr),
/*lbounds=*/mlir::ValueRange{});
+
+ // If the pointer is a registered CUDA Fortran variable, the descriptor needs
+ // to be synced.
+ if (auto declare = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
+ fPtr->getAddr().getDefiningOp()))
+ if (mlir::isa<fir::AddrOfOp>(declare.getMemref().getDefiningOp()))
+ if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) &&
+ !cuf::isCUDADeviceContext(builder.getRegion()))
+ fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc,
+ declare.getMemref());
}
// C_F_PROCPOINTER
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
new file mode 100644
index 0000000000000..90662c094c65e
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -0,0 +1,34 @@
+
+//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+
+using namespace Fortran::runtime::cuda;
+
+void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value hostPtr) {
+ mlir::func::FuncOp callee =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
+ builder);
+ auto fTy = callee.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
+ builder.create<fir::CallOp>(loc, callee, args);
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 2ab2d84f1643d..0fbec8a204b8d 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -8,6 +8,7 @@
#include "flang/Optimizer/Transforms/CUFOpConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion
auto hostAddr = builder.create<fir::AddrOfOp>(
loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
- mlir::func::FuncOp callee =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
- builder);
- auto fTy = callee.getFunctionType();
- mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
- mlir::Value sourceLine =
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
- builder.create<fir::CallOp>(loc, callee, args);
+ fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr);
op.erase();
return mlir::success();
}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index df2887ff1422e..2484f4f6b99a4 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -1277,7 +1277,7 @@ void SimplifyIntrinsicsPass::runOnOperation() {
fir::KindMapping kindMap = fir::getKindMapping(module);
module.walk([&](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
- if (cuf::isInCUDADeviceContext(op))
+ if (cuf::isCUDADeviceContext(op))
return;
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
mlir::StringRef funcName = callee.getLeafReference().getValue();
diff --git a/flang/test/Lower/CUDA/cuda-pointer.cuf b/flang/test/Lower/CUDA/cuda-pointer.cuf
index 2a9dbe54c2922..e9614751673e0 100644
--- a/flang/test/Lower/CUDA/cuda-pointer.cuf
+++ b/flang/test/Lower/CUDA/cuda-pointer.cuf
@@ -2,10 +2,31 @@
! Test lowering of CUDA pointers.
+module mod1
+
+integer, device, pointer :: x(:)
+
+contains
+
subroutine allocate_pointer
real, device, pointer :: pr(:)
allocate(pr(10))
end
-! CHECK-LABEL: func.func @_QPallocate_pointer()
+! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer()
! CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+
+subroutine c_f_pointer_sync
+ use iso_c_binding
+ use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
+ type(c_devptr) :: cd1
+ integer, parameter :: N = 2000
+ call c_f_pointer(cd1, x, (/ 2000 /))
+end
+
+! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync()
+! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<i8>
+! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> ()
+
+end module
|
wangzpgi
approved these changes
Mar 6, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
After a global device pointer is set through `c_f_pointer`, we need to sync the double descriptor so the version on the device is also up to date.