Skip to content

Commit d4c519e

Browse files
authored
[flang][cuda] Do inline allocation/deallocation in device code (#106628)
ALLOCATE and DEALLOCATE statements can be inlined in device functions. This patch updates the condition that determines whether these actions are inlined during lowering. This avoids runtime calls in device function code and can speed up execution. Also move `isCudaDeviceContext` out of `Bridge.cpp` so it can be used elsewhere.
1 parent e00e9a3 commit d4c519e

File tree

4 files changed

+84
-32
lines changed

4 files changed

+84
-32
lines changed

flang/include/flang/Lower/Cuda.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
//===-- Lower/Cuda.h -- Cuda Fortran utilities ------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef FORTRAN_LOWER_CUDA_H
14+
#define FORTRAN_LOWER_CUDA_H
15+
16+
#include "flang/Optimizer/Builder/FIRBuilder.h"
17+
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
18+
#include "flang/Semantics/tools.h"
19+
#include "mlir/Dialect/Func/IR/FuncOps.h"
20+
#include "mlir/Dialect/OpenACC/OpenACC.h"
21+
22+
namespace Fortran::lower {
23+
// Check if the insertion point is currently in a device context. HostDevice
24+
// subprogram are not considered fully device context so it will return false
25+
// for it.
26+
// If the insertion point is inside an OpenACC region op, it is considered
27+
// device context.
28+
static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
29+
if (builder.getRegion().getParentOfType<cuf::KernelOp>())
30+
return true;
31+
if (builder.getRegion()
32+
.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
33+
return true;
34+
if (auto funcOp = builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
35+
if (auto cudaProcAttr =
36+
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
37+
cuf::getProcAttrName())) {
38+
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
39+
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
40+
}
41+
}
42+
return false;
43+
}
44+
} // end namespace Fortran::lower
45+
46+
#endif // FORTRAN_LOWER_CUDA_H

flang/lib/Lower/Allocatable.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "flang/Lower/AbstractConverter.h"
1616
#include "flang/Lower/ConvertType.h"
1717
#include "flang/Lower/ConvertVariable.h"
18+
#include "flang/Lower/Cuda.h"
1819
#include "flang/Lower/IterationSpace.h"
1920
#include "flang/Lower/Mangler.h"
2021
#include "flang/Lower/OpenACC.h"
@@ -453,16 +454,22 @@ class AllocateStmtHelper {
453454

454455
void genSimpleAllocation(const Allocation &alloc,
455456
const fir::MutableBoxValue &box) {
456-
if (!box.isDerived() && !errorManager.hasStatSpec() &&
457-
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
458-
!useAllocateRuntime && !box.isPointer() &&
459-
!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
457+
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
458+
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
459+
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
460+
!alloc.type.IsPolymorphic() &&
461+
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
462+
!box.isPointer();
463+
464+
if (inlineAllocation &&
465+
((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
460466
// Pointers must use PointerAllocate so that their deallocations
461467
// can be validated.
462468
genInlinedAllocation(alloc, box);
463469
postAllocationAction(alloc);
464470
return;
465471
}
472+
466473
// Generate a sequence of runtime calls.
467474
errorManager.genStatCheck(builder, loc);
468475
genAllocateObjectInit(box);
@@ -473,7 +480,7 @@ class AllocateStmtHelper {
473480
genSetDeferredLengthParameters(alloc, box);
474481
genAllocateObjectBounds(alloc, box);
475482
mlir::Value stat;
476-
if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
483+
if (!isCudaSymbol)
477484
stat = genRuntimeAllocate(builder, loc, box, errorManager);
478485
else
479486
stat =
@@ -830,10 +837,14 @@ genDeallocate(fir::FirOpBuilder &builder,
830837
mlir::Value declaredTypeDesc = {},
831838
const Fortran::semantics::Symbol *symbol = nullptr) {
832839
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
833-
// Deallocate intrinsic types inline.
834-
if (!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
840+
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
841+
bool inlineDeallocation =
842+
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
835843
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
836-
!useAllocateRuntime && !box.isPointer() && !isCudaSymbol) {
844+
!useAllocateRuntime && !box.isPointer();
845+
// Deallocate intrinsic types inline.
846+
if (inlineDeallocation &&
847+
((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
837848
// Pointers must use PointerDeallocate so that their deallocations
838849
// can be validated.
839850
mlir::Value ret = fir::factory::genFreemem(builder, loc, box);

flang/lib/Lower/Bridge.cpp

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "flang/Lower/ConvertExprToHLFIR.h"
2121
#include "flang/Lower/ConvertType.h"
2222
#include "flang/Lower/ConvertVariable.h"
23+
#include "flang/Lower/Cuda.h"
2324
#include "flang/Lower/HostAssociations.h"
2425
#include "flang/Lower/IO.h"
2526
#include "flang/Lower/IterationSpace.h"
@@ -4377,36 +4378,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
43774378
return temps;
43784379
}
43794380

4380-
// Check if the insertion point is currently in a device context. HostDevice
4381-
// subprogram are not considered fully device context so it will return false
4382-
// for it.
4383-
// If the insertion point is inside an OpenACC region op, it is considered
4384-
// device context.
4385-
static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
4386-
if (builder.getRegion().getParentOfType<cuf::KernelOp>())
4387-
return true;
4388-
if (builder.getRegion()
4389-
.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
4390-
return true;
4391-
if (auto funcOp =
4392-
builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
4393-
if (auto cudaProcAttr =
4394-
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
4395-
cuf::getProcAttrName())) {
4396-
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
4397-
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
4398-
}
4399-
}
4400-
return false;
4401-
}
4402-
44034381
void genDataAssignment(
44044382
const Fortran::evaluate::Assignment &assign,
44054383
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
44064384
mlir::Location loc = getCurrentLocation();
44074385
fir::FirOpBuilder &builder = getFirOpBuilder();
44084386

4409-
bool isInDeviceContext = isCudaDeviceContext(builder);
4387+
bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
44104388

44114389
bool isCUDATransfer = (Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs) ||
44124390
Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs)) &&

flang/test/Lower/CUDA/cuda-allocatable.cuf

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,20 @@ end subroutine
164164
! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
165165
! CHECK: }
166166
! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
167+
168+
attributes(global) subroutine sub8()
169+
real, device, allocatable :: a(:)
170+
allocate(a(2))
171+
deallocate(a)
172+
end subroutine
173+
174+
! CHECK-LABEL: func.func @_QPsub8() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
175+
! CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
176+
! CHECK: %[[A:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
177+
! CHECK: %[[HEAP:.*]] = fir.allocmem !fir.array<?xf32>, %{{.*}} {fir.must_be_heap = true, uniq_name = "_QFsub8Ea.alloc"}
178+
! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
179+
! CHECK: %[[EMBOX:.*]] = fir.embox %[[HEAP]](%[[SHAPE]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
180+
! CHECK: fir.store %[[EMBOX]] to %[[A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
181+
! CHECK: %[[BOX:.*]] = fir.load %[[A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
182+
! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
183+
! CHECK: fir.freemem %[[BOXADDR]] : !fir.heap<!fir.array<?xf32>>

0 commit comments

Comments
 (0)