Skip to content

[flang][cuda] Sync double descriptor after c_f_pointer call #130194

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions flang/include/flang/Lower/Cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,6 @@
#include "mlir/Dialect/OpenACC/OpenACC.h"

namespace Fortran::lower {
// Tell whether the builder's current region is in a CUDA device context.
// This is the case when the region is nested in a cuf::KernelOp or in an
// OpenACC compute region op, or when the enclosing function carries a CUDA
// procedure attribute that is neither Host nor HostDevice. HostDevice
// subprograms are not treated as a full device context, so false is returned
// for them.
static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) {
  auto &currentRegion = builder.getRegion();
  if (currentRegion.getParentOfType<cuf::KernelOp>() ||
      currentRegion.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
    return true;

  auto enclosingFunc = currentRegion.getParentOfType<mlir::func::FuncOp>();
  if (!enclosingFunc)
    return false;

  auto procAttr =
      enclosingFunc.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
          cuf::getProcAttrName());
  if (!procAttr)
    return false;

  // Only attributes other than Host and HostDevice denote device code.
  const auto procKind = procAttr.getValue();
  return procKind != cuf::ProcAttribute::Host &&
         procKind != cuf::ProcAttribute::HostDevice;
}

static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
std::optional<Fortran::common::CUDADataAttr> cudaAttr =
Expand Down
4 changes: 3 additions & 1 deletion flang/include/flang/Optimizer/Builder/CUFCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ namespace cuf {
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);

bool isInCUDADeviceContext(mlir::Operation *op);
bool isCUDADeviceContext(mlir::Operation *op);
bool isCUDADeviceContext(mlir::Region &);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);

void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);

Expand Down
31 changes: 31 additions & 0 deletions flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_

#include "mlir/IR/Value.h"

namespace mlir {
class Location;
} // namespace mlir

namespace fir {
class FirOpBuilder;
}

namespace fir::runtime::cuda {

/// Generate a runtime call to sync the double descriptor referenced by
/// \p hostPtr. The call is created with \p builder at location \p loc.
void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value hostPtr);

} // namespace fir::runtime::cuda

#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
4 changes: 2 additions & 2 deletions flang/lib/Lower/Allocatable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ class AllocateStmtHelper {
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() &&
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
Expand Down Expand Up @@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder,
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineDeallocation =
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
Expand Down
2 changes: 1 addition & 1 deletion flang/lib/Lower/Bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4689,7 +4689,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();

bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());

bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
Expand Down
1 change: 1 addition & 0 deletions flang/lib/Optimizer/Builder/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_flang_library(FIRBuilder
Runtime/Assign.cpp
Runtime/Character.cpp
Runtime/Command.cpp
Runtime/CUDA/Descriptor.cpp
Runtime/Derived.cpp
Runtime/EnvironmentDefaults.cpp
Runtime/Exceptions.cpp
Expand Down
40 changes: 28 additions & 12 deletions flang/lib/Optimizer/Builder/CUFCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"

/// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
Expand All @@ -31,32 +32,47 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
return gpuMod;
}

bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
if (!op)
bool cuf::isCUDADeviceContext(mlir::Operation *op) {
if (!op || !op->getParentRegion())
return false;
if (op->getParentOfType<cuf::KernelOp>() ||
op->getParentOfType<mlir::gpu::GPUFuncOp>())
return isCUDADeviceContext(*op->getParentRegion());
}

// Check if the insertion point is currently in a device context. HostDevice
// subprograms are not considered a full device context, so this returns false
// for them.
// If the insertion point is inside an OpenACC region op, it is considered
// device context.
bool cuf::isCUDADeviceContext(mlir::Region &region) {
if (region.getParentOfType<cuf::KernelOp>())
return true;
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
return true;
if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
cuf::getProcAttrName())) {
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
if (auto funcOp = region.getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr =
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
cuf::getProcAttrName())) {
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
return false;
}

bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
if (op.getConstant())
return false;
auto attr = op.getDataAttr();
/// Return true when \p attr is set and is one of the Device, Managed or
/// Constant data attributes; return false for an empty optional or any other
/// attribute value.
bool cuf::isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr) {
  if (!attr)
    return false;
  switch (*attr) {
  case cuf::DataAttribute::Device:
  case cuf::DataAttribute::Managed:
  case cuf::DataAttribute::Constant:
    return true;
  default:
    return false;
  }
}

/// Return true when the global \p op is not marked constant and its data
/// attribute satisfies isRegisteredDeviceAttr.
bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
  // Constant globals are rejected before the data attribute is inspected.
  return !op.getConstant() && isRegisteredDeviceAttr(op.getDataAttr());
}

void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
Expand Down
14 changes: 14 additions & 0 deletions flang/lib/Optimizer/Builder/IntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Common/static-multimap-view.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Command.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
Expand All @@ -38,6 +40,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/Utils.h"
#include "flang/Runtime/entry-names.h"
Expand Down Expand Up @@ -3254,6 +3257,17 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {

fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr),
/*lbounds=*/mlir::ValueRange{});

// If the pointer is a registered CUDA Fortran variable, the descriptor needs
// to be synced.
if (auto declare = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
fPtr->getAddr().getDefiningOp()))
if (declare.getMemref().getDefiningOp() &&
mlir::isa<fir::AddrOfOp>(declare.getMemref().getDefiningOp()))
if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) &&
!cuf::isCUDADeviceContext(builder.getRegion()))
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc,
declare.getMemref());
}

// C_F_PROCPOINTER
Expand Down
34 changes: 34 additions & 0 deletions flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Runtime/CUDA/descriptor.h"

using namespace Fortran::runtime::cuda;

/// Emit a call to the CUFSyncGlobalDescriptor runtime entry point for
/// \p hostPtr. The source file name and line number derived from \p loc are
/// passed as the trailing arguments of the call.
void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
                                                 mlir::Location loc,
                                                 mlir::Value hostPtr) {
  mlir::func::FuncOp syncFunc =
      fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
                                                                     builder);
  const auto syncFuncTy = syncFunc.getFunctionType();

  mlir::Value fileName = fir::factory::locationToFilename(builder, loc);
  // The line number is created with the type of the callee's third input.
  mlir::Value lineNo =
      fir::factory::locationToLineNo(builder, loc, syncFuncTy.getInput(2));

  llvm::SmallVector<mlir::Value> callArgs = fir::runtime::createArguments(
      builder, loc, syncFuncTy, hostPtr, fileName, lineNo);
  builder.create<fir::CallOp>(loc, syncFunc, callArgs);
}
12 changes: 2 additions & 10 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "flang/Optimizer/Transforms/CUFOpConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
Expand Down Expand Up @@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion

auto hostAddr = builder.create<fir::AddrOfOp>(
loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
mlir::func::FuncOp callee =
fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
builder);
auto fTy = callee.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
builder.create<fir::CallOp>(loc, callee, args);
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr);
op.erase();
return mlir::success();
}
Expand Down
2 changes: 1 addition & 1 deletion flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,7 @@ void SimplifyIntrinsicsPass::runOnOperation() {
fir::KindMapping kindMap = fir::getKindMapping(module);
module.walk([&](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
if (cuf::isInCUDADeviceContext(op))
if (cuf::isCUDADeviceContext(op))
return;
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
mlir::StringRef funcName = callee.getLeafReference().getValue();
Expand Down
23 changes: 22 additions & 1 deletion flang/test/Lower/CUDA/cuda-pointer.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,31 @@

! Test lowering of CUDA pointers.

module mod1

integer, device, pointer :: x(:)

contains

subroutine allocate_pointer
real, device, pointer :: pr(:)
allocate(pr(10))
end

! CHECK-LABEL: func.func @_QPallocate_pointer()
! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer()
! CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>

subroutine c_f_pointer_sync
use iso_c_binding
use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
type(c_devptr) :: cd1
integer, parameter :: N = 2000
call c_f_pointer(cd1, x, (/ 2000 /))
end

! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync()
! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<i8>
! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> ()

end module