Skip to content

Commit 9708d09

Browse files
authored
[MLIR][OpenMP] Skip host omp ops when compiling for the target device (#85239)
This patch separates the lowering dispatch for host and target devices. For the target device, if the current operation is neither a top-level target operation (e.g. omp.target) nor inside a target device code region, it will be ignored, since it belongs to the host code; any target operations nested inside it are still lowered. This is an alternative approach to #84611; the new test in this PR was taken from there.
1 parent 379628d commit 9708d09

8 files changed

+314
-118
lines changed

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 178 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -3116,6 +3116,174 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute,
31163116
return success();
31173117
}
31183118

3119+
// Returns true if the operation is inside a TargetOp or
3120+
// is part of a declare target function whose device type is not `host`.
3121+
static bool isTargetDeviceOp(Operation *op) {
3122+
// Assumes no reverse offloading
3123+
if (op->getParentOfType<omp::TargetOp>())
3124+
return true;
3125+
3126+
if (auto parentFn = op->getParentOfType<LLVM::LLVMFuncOp>())
3127+
if (auto declareTargetIface =
3128+
llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
3129+
parentFn.getOperation()))
3130+
if (declareTargetIface.isDeclareTarget() &&
3131+
declareTargetIface.getDeclareTargetDeviceType() !=
3132+
mlir::omp::DeclareTargetDeviceType::host)
3133+
return true;
3134+
3135+
return false;
3136+
}
3137+
3138+
/// Given an OpenMP MLIR operation, create the corresponding LLVM IR
3139+
/// (including OpenMP runtime calls).
3140+
static LogicalResult
3141+
convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
3142+
LLVM::ModuleTranslation &moduleTranslation) {
3143+
3144+
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
3145+
3146+
return llvm::TypeSwitch<Operation *, LogicalResult>(op)
3147+
.Case([&](omp::BarrierOp) {
3148+
ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
3149+
return success();
3150+
})
3151+
.Case([&](omp::TaskwaitOp) {
3152+
ompBuilder->createTaskwait(builder.saveIP());
3153+
return success();
3154+
})
3155+
.Case([&](omp::TaskyieldOp) {
3156+
ompBuilder->createTaskyield(builder.saveIP());
3157+
return success();
3158+
})
3159+
.Case([&](omp::FlushOp) {
3160+
// No support in OpenMP runtime function (__kmpc_flush) to accept
3161+
// the argument list.
3162+
// OpenMP standard states the following:
3163+
// "An implementation may implement a flush with a list by ignoring
3164+
// the list, and treating it the same as a flush without a list."
3165+
//
3166+
// The argument list is discarded so that a flush with a list is treated
3167+
// the same as a flush without a list.
3168+
ompBuilder->createFlush(builder.saveIP());
3169+
return success();
3170+
})
3171+
.Case([&](omp::ParallelOp op) {
3172+
return convertOmpParallel(op, builder, moduleTranslation);
3173+
})
3174+
.Case([&](omp::ReductionOp reductionOp) {
3175+
return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
3176+
})
3177+
.Case([&](omp::MasterOp) {
3178+
return convertOmpMaster(*op, builder, moduleTranslation);
3179+
})
3180+
.Case([&](omp::CriticalOp) {
3181+
return convertOmpCritical(*op, builder, moduleTranslation);
3182+
})
3183+
.Case([&](omp::OrderedRegionOp) {
3184+
return convertOmpOrderedRegion(*op, builder, moduleTranslation);
3185+
})
3186+
.Case([&](omp::OrderedOp) {
3187+
return convertOmpOrdered(*op, builder, moduleTranslation);
3188+
})
3189+
.Case([&](omp::WsloopOp) {
3190+
return convertOmpWsloop(*op, builder, moduleTranslation);
3191+
})
3192+
.Case([&](omp::SimdLoopOp) {
3193+
return convertOmpSimdLoop(*op, builder, moduleTranslation);
3194+
})
3195+
.Case([&](omp::AtomicReadOp) {
3196+
return convertOmpAtomicRead(*op, builder, moduleTranslation);
3197+
})
3198+
.Case([&](omp::AtomicWriteOp) {
3199+
return convertOmpAtomicWrite(*op, builder, moduleTranslation);
3200+
})
3201+
.Case([&](omp::AtomicUpdateOp op) {
3202+
return convertOmpAtomicUpdate(op, builder, moduleTranslation);
3203+
})
3204+
.Case([&](omp::AtomicCaptureOp op) {
3205+
return convertOmpAtomicCapture(op, builder, moduleTranslation);
3206+
})
3207+
.Case([&](omp::SectionsOp) {
3208+
return convertOmpSections(*op, builder, moduleTranslation);
3209+
})
3210+
.Case([&](omp::SingleOp op) {
3211+
return convertOmpSingle(op, builder, moduleTranslation);
3212+
})
3213+
.Case([&](omp::TeamsOp op) {
3214+
return convertOmpTeams(op, builder, moduleTranslation);
3215+
})
3216+
.Case([&](omp::TaskOp op) {
3217+
return convertOmpTaskOp(op, builder, moduleTranslation);
3218+
})
3219+
.Case([&](omp::TaskgroupOp op) {
3220+
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
3221+
})
3222+
.Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
3223+
omp::CriticalDeclareOp>([](auto op) {
3224+
// `yield` and `terminator` can be just omitted. The block structure
3225+
// was created in the region that handles their parent operation.
3226+
// `declare_reduction` will be used by reductions and is not
3227+
// converted directly, skip it.
3228+
// `critical.declare` is only used to declare names of critical
3229+
// sections which will be used by `critical` ops and hence can be
3230+
// ignored for lowering. The OpenMP IRBuilder will create unique
3231+
// name for critical section names.
3232+
return success();
3233+
})
3234+
.Case([&](omp::ThreadprivateOp) {
3235+
return convertOmpThreadprivate(*op, builder, moduleTranslation);
3236+
})
3237+
.Case<omp::TargetDataOp, omp::TargetEnterDataOp, omp::TargetExitDataOp,
3238+
omp::TargetUpdateOp>([&](auto op) {
3239+
return convertOmpTargetData(op, builder, moduleTranslation);
3240+
})
3241+
.Case([&](omp::TargetOp) {
3242+
return convertOmpTarget(*op, builder, moduleTranslation);
3243+
})
3244+
.Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp>(
3245+
[&](auto op) {
3246+
// No-op, should be handled by relevant owning operations e.g.
3247+
// TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp etc.
3248+
// and then discarded
3249+
return success();
3250+
})
3251+
.Default([&](Operation *inst) {
3252+
return inst->emitError("unsupported OpenMP operation: ")
3253+
<< inst->getName();
3254+
});
3255+
}
3256+
3257+
static LogicalResult
3258+
convertTargetDeviceOp(Operation *op, llvm::IRBuilderBase &builder,
3259+
LLVM::ModuleTranslation &moduleTranslation) {
3260+
return convertHostOrTargetOperation(op, builder, moduleTranslation);
3261+
}
3262+
3263+
static LogicalResult
3264+
convertTargetOpsInNest(Operation *op, llvm::IRBuilderBase &builder,
3265+
LLVM::ModuleTranslation &moduleTranslation) {
3266+
if (isa<omp::TargetOp>(op))
3267+
return convertOmpTarget(*op, builder, moduleTranslation);
3268+
if (isa<omp::TargetDataOp>(op))
3269+
return convertOmpTargetData(op, builder, moduleTranslation);
3270+
bool interrupted =
3271+
op->walk<WalkOrder::PreOrder>([&](Operation *oper) {
3272+
if (isa<omp::TargetOp>(oper)) {
3273+
if (failed(convertOmpTarget(*oper, builder, moduleTranslation)))
3274+
return WalkResult::interrupt();
3275+
return WalkResult::skip();
3276+
}
3277+
if (isa<omp::TargetDataOp>(oper)) {
3278+
if (failed(convertOmpTargetData(oper, builder, moduleTranslation)))
3279+
return WalkResult::interrupt();
3280+
return WalkResult::skip();
3281+
}
3282+
return WalkResult::advance();
3283+
}).wasInterrupted();
3284+
return failure(interrupted);
3285+
}
3286+
31193287
namespace {
31203288

31213289
/// Implementation of the dialect interface that converts operations belonging
@@ -3131,8 +3299,8 @@ class OpenMPDialectLLVMIRTranslationInterface
31313299
convertOperation(Operation *op, llvm::IRBuilderBase &builder,
31323300
LLVM::ModuleTranslation &moduleTranslation) const final;
31333301

3134-
/// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR, runtime
3135-
/// calls, or operation amendments
3302+
/// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR,
3303+
/// runtime calls, or operation amendments
31363304
LogicalResult
31373305
amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
31383306
NamedAttribute attribute,
@@ -3237,116 +3405,15 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
32373405
LLVM::ModuleTranslation &moduleTranslation) const {
32383406

32393407
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
3408+
if (ompBuilder->Config.isTargetDevice()) {
3409+
if (isTargetDeviceOp(op)) {
3410+
return convertTargetDeviceOp(op, builder, moduleTranslation);
3411+
} else {
3412+
return convertTargetOpsInNest(op, builder, moduleTranslation);
3413+
}
3414+
}
32403415

3241-
return llvm::TypeSwitch<Operation *, LogicalResult>(op)
3242-
.Case([&](omp::BarrierOp) {
3243-
ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
3244-
return success();
3245-
})
3246-
.Case([&](omp::TaskwaitOp) {
3247-
ompBuilder->createTaskwait(builder.saveIP());
3248-
return success();
3249-
})
3250-
.Case([&](omp::TaskyieldOp) {
3251-
ompBuilder->createTaskyield(builder.saveIP());
3252-
return success();
3253-
})
3254-
.Case([&](omp::FlushOp) {
3255-
// No support in OpenMP runtime function (__kmpc_flush) to accept
3256-
// the argument list.
3257-
// OpenMP standard states the following:
3258-
// "An implementation may implement a flush with a list by ignoring
3259-
// the list, and treating it the same as a flush without a list."
3260-
//
3261-
// The argument list is discarded so that a flush with a list is treated
3262-
// the same as a flush without a list.
3263-
ompBuilder->createFlush(builder.saveIP());
3264-
return success();
3265-
})
3266-
.Case([&](omp::ParallelOp op) {
3267-
return convertOmpParallel(op, builder, moduleTranslation);
3268-
})
3269-
.Case([&](omp::ReductionOp reductionOp) {
3270-
return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
3271-
})
3272-
.Case([&](omp::MasterOp) {
3273-
return convertOmpMaster(*op, builder, moduleTranslation);
3274-
})
3275-
.Case([&](omp::CriticalOp) {
3276-
return convertOmpCritical(*op, builder, moduleTranslation);
3277-
})
3278-
.Case([&](omp::OrderedRegionOp) {
3279-
return convertOmpOrderedRegion(*op, builder, moduleTranslation);
3280-
})
3281-
.Case([&](omp::OrderedOp) {
3282-
return convertOmpOrdered(*op, builder, moduleTranslation);
3283-
})
3284-
.Case([&](omp::WsloopOp) {
3285-
return convertOmpWsloop(*op, builder, moduleTranslation);
3286-
})
3287-
.Case([&](omp::SimdLoopOp) {
3288-
return convertOmpSimdLoop(*op, builder, moduleTranslation);
3289-
})
3290-
.Case([&](omp::AtomicReadOp) {
3291-
return convertOmpAtomicRead(*op, builder, moduleTranslation);
3292-
})
3293-
.Case([&](omp::AtomicWriteOp) {
3294-
return convertOmpAtomicWrite(*op, builder, moduleTranslation);
3295-
})
3296-
.Case([&](omp::AtomicUpdateOp op) {
3297-
return convertOmpAtomicUpdate(op, builder, moduleTranslation);
3298-
})
3299-
.Case([&](omp::AtomicCaptureOp op) {
3300-
return convertOmpAtomicCapture(op, builder, moduleTranslation);
3301-
})
3302-
.Case([&](omp::SectionsOp) {
3303-
return convertOmpSections(*op, builder, moduleTranslation);
3304-
})
3305-
.Case([&](omp::SingleOp op) {
3306-
return convertOmpSingle(op, builder, moduleTranslation);
3307-
})
3308-
.Case([&](omp::TeamsOp op) {
3309-
return convertOmpTeams(op, builder, moduleTranslation);
3310-
})
3311-
.Case([&](omp::TaskOp op) {
3312-
return convertOmpTaskOp(op, builder, moduleTranslation);
3313-
})
3314-
.Case([&](omp::TaskgroupOp op) {
3315-
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
3316-
})
3317-
.Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
3318-
omp::CriticalDeclareOp>([](auto op) {
3319-
// `yield` and `terminator` can be just omitted. The block structure
3320-
// was created in the region that handles their parent operation.
3321-
// `declare_reduction` will be used by reductions and is not
3322-
// converted directly, skip it.
3323-
// `critical.declare` is only used to declare names of critical
3324-
// sections which will be used by `critical` ops and hence can be
3325-
// ignored for lowering. The OpenMP IRBuilder will create unique
3326-
// name for critical section names.
3327-
return success();
3328-
})
3329-
.Case([&](omp::ThreadprivateOp) {
3330-
return convertOmpThreadprivate(*op, builder, moduleTranslation);
3331-
})
3332-
.Case<omp::TargetDataOp, omp::TargetEnterDataOp, omp::TargetExitDataOp,
3333-
omp::TargetUpdateOp>([&](auto op) {
3334-
return convertOmpTargetData(op, builder, moduleTranslation);
3335-
})
3336-
.Case([&](omp::TargetOp) {
3337-
return convertOmpTarget(*op, builder, moduleTranslation);
3338-
})
3339-
.Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp>(
3340-
[&](auto op) {
3341-
// No-op, should be handled by relevant owning operations e.g.
3342-
// TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp etc.
3343-
// and then discarded
3344-
return success();
3345-
})
3346-
.Default([&](Operation *inst) {
3347-
return inst->emitError("unsupported OpenMP operation: ")
3348-
<< inst->getName();
3349-
});
3416+
return convertHostOrTargetOperation(op, builder, moduleTranslation);
33503417
}
33513418

33523419
void mlir::registerOpenMPDialectTranslation(DialectRegistry &registry) {

mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
// for nested omp do loop inside omp target region
55

66
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
7-
llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {
7+
llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>,
88
target_cpu = "gfx90a",
9-
target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>
10-
} {
9+
target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>}
10+
{
1111
omp.parallel {
1212
%loop_ub = llvm.mlir.constant(9 : i32) : i32
1313
%loop_lb = llvm.mlir.constant(0 : i32) : i32
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
2+
3+
module attributes {omp.is_target_device = true, omp.is_gpu = true} {
4+
llvm.func @omp_target_region_() {
5+
%0 = llvm.mlir.constant(20 : i32) : i32
6+
%1 = llvm.mlir.constant(10 : i32) : i32
7+
%2 = llvm.mlir.constant(1 : i64) : i64
8+
%3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr
9+
%4 = llvm.mlir.constant(1 : i64) : i64
10+
%5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr
11+
%6 = llvm.mlir.constant(1 : i64) : i64
12+
%7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
13+
llvm.store %1, %3 : i32, !llvm.ptr
14+
llvm.store %0, %5 : i32, !llvm.ptr
15+
omp.task {
16+
%map1 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
17+
%map2 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
18+
%map3 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
19+
omp.target map_entries(%map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
20+
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
21+
%8 = llvm.load %arg0 : !llvm.ptr -> i32
22+
%9 = llvm.load %arg1 : !llvm.ptr -> i32
23+
%10 = llvm.add %8, %9 : i32
24+
llvm.store %10, %arg2 : i32, !llvm.ptr
25+
omp.terminator
26+
}
27+
omp.terminator
28+
}
29+
llvm.return
30+
}
31+
32+
llvm.func @omp_target_no_map() {
33+
omp.target {
34+
omp.terminator
35+
}
36+
llvm.return
37+
}
38+
}
39+
40+
// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}_omp_target_region__l19
41+
// CHECK: ret void

mlir/test/Target/LLVMIR/omptarget-teams-llvm.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
module attributes {omp.is_target_device = true} {
77
llvm.func @foo(i32)
8-
llvm.func @omp_target_teams_shared_simple(%arg0 : i32) {
8+
llvm.func @omp_target_teams_shared_simple(%arg0 : i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
99
omp.teams {
1010
llvm.call @foo(%arg0) : (i32) -> ()
1111
omp.terminator

mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// for nested omp do loop with collapse clause inside omp target region
55

66
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
7-
llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
7+
llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
88
%loop_ub = llvm.mlir.constant(99 : i32) : i32
99
%loop_lb = llvm.mlir.constant(0 : i32) : i32
1010
%loop_step = llvm.mlir.constant(1 : index) : i32

0 commit comments

Comments
 (0)