Skip to content

Commit 8ff0d3b

Browse files
committed
[MLIR][OpenMP] LLVM IR translation of host_eval
This patch adds support for processing the `host_eval` clause of `omp.target` to populate default and runtime kernel launch attributes. Specifically, these relate to the `num_teams`, `thread_limit` and `num_threads` clauses attached to operations nested inside of `omp.target`. As a result, the `thread_limit` clause of `omp.target` is also supported. The implementation of `initTargetDefaultAttrs()` is intended to reflect clang's own processing of multiple constructs and clauses in order to define a default number of teams and threads to be used as kernel attributes and to populate global variables in the target device module. One side effect of this change is that it is no longer possible to translate target device MLIR modules to LLVM IR unless they have a supported target triple. This is because the local `getGridValue()` function in the `OpenMPIRBuilder` only works for certain architectures, and it is called whenever the maximum number of threads has not been explicitly defined. This limitation also matches clang. Evaluating the collapsed loop trip count of target SPMD kernels remains unsupported.
1 parent cc5c5cc commit 8ff0d3b

18 files changed

+344
-58
lines changed

flang/test/Integration/OpenMP/target-filtering.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
!===----------------------------------------------------------------------===!
88

99
!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefixes HOST,ALL
10-
!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE,ALL
10+
!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-llvm -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE,ALL
1111

1212
!HOST: define {{.*}}@{{.*}}before{{.*}}(
1313
!DEVICE-NOT: define {{.*}}@before{{.*}}(

flang/test/Lower/OpenMP/function-filtering-2.f90

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-HOST %s
22
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
3-
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-DEVICE %s
4-
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
3+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-DEVICE %s
4+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
55
! RUN: bbc -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
6-
! RUN: bbc -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
6+
! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
77

88
! MLIR: func.func @{{.*}}implicit_invocation() attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>}
99
! MLIR: return

flang/test/Lower/OpenMP/function-filtering-3.f90

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
! RUN: %flang_fc1 -fopenmp -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-HOST,LLVM-ALL %s
22
! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
3-
! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
4-
! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
3+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
4+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
55
! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
6-
! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
6+
! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
77

88
! Check that the correct LLVM IR functions are kept for the host and device
99
! after running the whole set of translation and transformation passes from

flang/test/Lower/OpenMP/function-filtering.f90

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-HOST,LLVM-ALL %s
22
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
3-
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
4-
! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
3+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
4+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
55
! RUN: bbc -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
6-
! RUN: bbc -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
6+
! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
77

88
! Check that the correct LLVM IR functions are kept for the host and device
99
! after running the whole set of translation and transformation passes from

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 229 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
174174
if (op.getHint())
175175
op.emitWarning("hint clause discarded");
176176
};
177-
auto checkHostEval = [&todo](auto op, LogicalResult &result) {
178-
if (!op.getHostEvalVars().empty())
179-
result = todo("host_eval");
180-
};
181177
auto checkIf = [&todo](auto op, LogicalResult &result) {
182178
if (op.getIfExpr())
183179
result = todo("if");
@@ -228,10 +224,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
228224
op.getReductionSyms())
229225
result = todo("reduction");
230226
};
231-
auto checkThreadLimit = [&todo](auto op, LogicalResult &result) {
232-
if (op.getThreadLimit())
233-
result = todo("thread_limit");
234-
};
235227
auto checkTaskReduction = [&todo](auto op, LogicalResult &result) {
236228
if (!op.getTaskReductionVars().empty() || op.getTaskReductionByref() ||
237229
op.getTaskReductionSyms())
@@ -295,7 +287,16 @@ static LogicalResult checkImplementationStatus(Operation &op) {
295287
checkAllocate(op, result);
296288
checkDevice(op, result);
297289
checkHasDeviceAddr(op, result);
298-
checkHostEval(op, result);
290+
291+
// Host evaluated clauses are supported, except for target SPMD loop
292+
// bounds.
293+
for (BlockArgument arg :
294+
cast<omp::BlockArgOpenMPOpInterface>(*op).getHostEvalBlockArgs())
295+
for (Operation *user : arg.getUsers())
296+
if (isa<omp::LoopNestOp>(user))
297+
result = op.emitError("not yet implemented: host evaluation of "
298+
"loop bounds in omp.target operation");
299+
299300
checkIf(op, result);
300301
checkInReduction(op, result);
301302
checkIsDevicePtr(op, result);
@@ -316,7 +317,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
316317
"structures in omp.target operation");
317318
}
318319
}
319-
checkThreadLimit(op, result);
320320
})
321321
.Default([](Operation &) {
322322
// Assume all clauses for an operation can be translated unless they are
@@ -3800,6 +3800,201 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
38003800
return builder.saveIP();
38013801
}
38023802

3803+
/// Follow uses of `host_eval`-defined block arguments of the given `omp.target`
3804+
/// operation and populate output variables with their corresponding host value
3805+
/// (i.e. operand evaluated outside of the target region), based on their uses
3806+
/// inside of the target region.
3807+
///
3808+
/// Loop bounds and steps are not extracted yet (see the TODO in the
3809+
/// `omp.LoopNestOp` case below), so no output is produced for them.
3810+
static void extractHostEvalClauses(omp::TargetOp targetOp, Value &numThreads,
3811+
Value &numTeamsLower, Value &numTeamsUpper,
3812+
Value &threadLimit) {
3813+
auto blockArgIface = llvm::cast<omp::BlockArgOpenMPOpInterface>(*targetOp);
3814+
for (auto item : llvm::zip_equal(targetOp.getHostEvalVars(),
3815+
blockArgIface.getHostEvalBlockArgs())) {
3816+
Value hostEvalVar = std::get<0>(item), blockArg = std::get<1>(item);
3817+
3818+
for (Operation *user : blockArg.getUsers()) {
3819+
llvm::TypeSwitch<Operation *>(user)
3820+
.Case([&](omp::TeamsOp teamsOp) {
3821+
if (teamsOp.getNumTeamsLower() == blockArg)
3822+
numTeamsLower = hostEvalVar;
3823+
else if (teamsOp.getNumTeamsUpper() == blockArg)
3824+
numTeamsUpper = hostEvalVar;
3825+
else if (teamsOp.getThreadLimit() == blockArg)
3826+
threadLimit = hostEvalVar;
3827+
else
3828+
llvm_unreachable("unsupported host_eval use");
3829+
})
3830+
.Case([&](omp::ParallelOp parallelOp) {
3831+
if (parallelOp.getNumThreads() == blockArg)
3832+
numThreads = hostEvalVar;
3833+
else
3834+
llvm_unreachable("unsupported host_eval use");
3835+
})
3836+
.Case([&](omp::LoopNestOp loopOp) {
3837+
// TODO: Extract bounds and step values.
3838+
})
3839+
.Default([](Operation *) {
3840+
llvm_unreachable("unsupported host_eval use");
3841+
});
3842+
}
3843+
}
3844+
}
3845+
3846+
/// If \p op is of the given type parameter, return it casted to that type.
3847+
/// Otherwise, if its immediate parent operation (or some other higher-level
3848+
/// parent, if \p immediateParent is false) is of that type, return that parent
3849+
/// casted to the given type.
3850+
///
3851+
/// If \p op is \c null or neither it nor its parent(s) are of the specified
3852+
/// type, return a \c null operation.
3853+
template <typename OpTy>
3854+
static OpTy castOrGetParentOfType(Operation *op, bool immediateParent = false) {
3855+
if (!op)
3856+
return OpTy();
3857+
3858+
if (OpTy casted = dyn_cast<OpTy>(op))
3859+
return casted;
3860+
3861+
if (immediateParent)
3862+
return dyn_cast_if_present<OpTy>(op->getParentOp());
3863+
3864+
return op->getParentOfType<OpTy>();
3865+
}
3866+
3867+
/// Initialize `MinTeams`, `MaxTeams` and `MaxThreads` to their default
3868+
/// values as stated by the corresponding clauses, if constant.
3869+
///
3870+
/// These default values must be set before the creation of the outlined LLVM
3871+
/// function for the target region, so that they can be used to initialize the
3872+
/// corresponding global `ConfigurationEnvironmentTy` structure.
3873+
static void
3874+
initTargetDefaultAttrs(omp::TargetOp targetOp,
3875+
llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &attrs,
3876+
bool isTargetDevice) {
3877+
Value hostNumThreads, hostNumTeamsLower, hostNumTeamsUpper, hostThreadLimit;
3878+
extractHostEvalClauses(targetOp, hostNumThreads, hostNumTeamsLower,
3879+
hostNumTeamsUpper, hostThreadLimit);
3880+
3881+
// TODO: Handle constant 'if' clauses.
3882+
Operation *capturedOp = targetOp.getInnermostCapturedOmpOp();
3883+
3884+
auto extractConstInteger = [](Value value) -> std::optional<int64_t> {
3885+
if (auto constOp =
3886+
dyn_cast_if_present<LLVM::ConstantOp>(value.getDefiningOp()))
3887+
if (auto constAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
3888+
return constAttr.getInt();
3889+
3890+
return std::nullopt;
3891+
};
3892+
3893+
// Handle clauses impacting the number of teams.
3894+
3895+
int32_t minTeamsVal = 1, maxTeamsVal = -1;
3896+
if (castOrGetParentOfType<omp::TeamsOp>(capturedOp)) {
3897+
// TODO: Use `hostNumTeamsLower` to initialize `minTeamsVal`. For now, match
3898+
// clang and set min and max to the same value.
3899+
if (hostNumTeamsUpper) {
3900+
if (auto val = extractConstInteger(hostNumTeamsUpper))
3901+
minTeamsVal = maxTeamsVal = *val;
3902+
} else {
3903+
minTeamsVal = maxTeamsVal = 0;
3904+
}
3905+
} else if (castOrGetParentOfType<omp::ParallelOp>(capturedOp,
3906+
/*immediateParent=*/true) ||
3907+
castOrGetParentOfType<omp::SimdOp>(capturedOp,
3908+
/*immediateParent=*/true)) {
3909+
minTeamsVal = maxTeamsVal = 1;
3910+
} else {
3911+
minTeamsVal = maxTeamsVal = -1;
3912+
}
3913+
3914+
// Handle clauses impacting the number of threads.
3915+
3916+
auto setMaxValueFromClause = [&extractConstInteger](Value clauseValue,
3917+
int32_t &result) {
3918+
if (!clauseValue)
3919+
return;
3920+
3921+
if (auto val = extractConstInteger(clauseValue))
3922+
result = *val;
3923+
3924+
// Found an applicable clause, so it's not undefined. Mark as unknown
3925+
// because it's not constant.
3926+
if (result < 0)
3927+
result = 0;
3928+
};
3929+
3930+
// Extract 'thread_limit' clause from 'target' and 'teams' directives.
3931+
int32_t targetThreadLimitVal = -1, teamsThreadLimitVal = -1;
3932+
setMaxValueFromClause(targetOp.getThreadLimit(), targetThreadLimitVal);
3933+
setMaxValueFromClause(hostThreadLimit, teamsThreadLimitVal);
3934+
3935+
// Extract 'num_threads' clause from 'parallel' or set to 1 if it's SIMD.
3936+
int32_t maxThreadsVal = -1;
3937+
if (auto parallelOp = castOrGetParentOfType<omp::ParallelOp>(capturedOp))
3938+
setMaxValueFromClause(hostNumThreads, maxThreadsVal);
3939+
else if (castOrGetParentOfType<omp::SimdOp>(capturedOp,
3940+
/*immediateParent=*/true))
3941+
maxThreadsVal = 1;
3942+
3943+
// For max values, < 0 means unset, == 0 means set but unknown. Select the
3944+
// minimum value between 'max_threads' and 'thread_limit' clauses that were
3945+
// set.
3946+
int32_t combinedMaxThreadsVal = targetThreadLimitVal;
3947+
if (combinedMaxThreadsVal < 0 ||
3948+
(teamsThreadLimitVal >= 0 && teamsThreadLimitVal < combinedMaxThreadsVal))
3949+
combinedMaxThreadsVal = teamsThreadLimitVal;
3950+
3951+
if (combinedMaxThreadsVal < 0 ||
3952+
(maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal))
3953+
combinedMaxThreadsVal = maxThreadsVal;
3954+
3955+
// Update kernel bounds structure for the `OpenMPIRBuilder` to use.
3956+
attrs.MinTeams = minTeamsVal;
3957+
attrs.MaxTeams.front() = maxTeamsVal;
3958+
attrs.MinThreads = 1;
3959+
attrs.MaxThreads.front() = combinedMaxThreadsVal;
3960+
}
3961+
3962+
/// Gather LLVM runtime values for all clauses evaluated in the host that are
3963+
/// passed to the kernel invocation.
3964+
///
3965+
/// This function must be called only when compiling for the host. Also, it will
3966+
/// only provide correct results if it's called after the body of \c targetOp
3967+
/// has been fully generated.
3968+
static void
3969+
initTargetRuntimeAttrs(llvm::IRBuilderBase &builder,
3970+
LLVM::ModuleTranslation &moduleTranslation,
3971+
omp::TargetOp targetOp,
3972+
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs &attrs) {
3973+
Value numThreads, numTeamsLower, numTeamsUpper, teamsThreadLimit;
3974+
extractHostEvalClauses(targetOp, numThreads, numTeamsLower, numTeamsUpper,
3975+
teamsThreadLimit);
3976+
3977+
// TODO: Handle constant 'if' clauses.
3978+
if (Value targetThreadLimit = targetOp.getThreadLimit())
3979+
attrs.TargetThreadLimit.front() =
3980+
moduleTranslation.lookupValue(targetThreadLimit);
3981+
3982+
if (numTeamsLower)
3983+
attrs.MinTeams = moduleTranslation.lookupValue(numTeamsLower);
3984+
3985+
if (numTeamsUpper)
3986+
attrs.MaxTeams.front() = moduleTranslation.lookupValue(numTeamsUpper);
3987+
3988+
if (teamsThreadLimit)
3989+
attrs.TeamsThreadLimit.front() =
3990+
moduleTranslation.lookupValue(teamsThreadLimit);
3991+
3992+
if (numThreads)
3993+
attrs.MaxThreads = moduleTranslation.lookupValue(numThreads);
3994+
3995+
// TODO: Populate attrs.LoopTripCount if it is target SPMD.
3996+
}
3997+
38033998
static LogicalResult
38043999
convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
38054000
LLVM::ModuleTranslation &moduleTranslation) {
@@ -3809,12 +4004,13 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
38094004

38104005
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
38114006
bool isTargetDevice = ompBuilder->Config.isTargetDevice();
4007+
38124008
auto parentFn = opInst.getParentOfType<LLVM::LLVMFuncOp>();
4009+
auto blockIface = cast<omp::BlockArgOpenMPOpInterface>(opInst);
38134010
auto &targetRegion = targetOp.getRegion();
38144011
DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
38154012
SmallVector<Value> mapVars = targetOp.getMapVars();
3816-
ArrayRef<BlockArgument> mapBlockArgs =
3817-
cast<omp::BlockArgOpenMPOpInterface>(opInst).getMapBlockArgs();
4013+
ArrayRef<BlockArgument> mapBlockArgs = blockIface.getMapBlockArgs();
38184014
llvm::Function *llvmOutlinedFn = nullptr;
38194015

38204016
// TODO: It can also be false if a compile-time constant `false` IF clause is
@@ -3857,7 +4053,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
38574053
OperandRange privateVars = targetOp.getPrivateVars();
38584054
std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
38594055
MutableArrayRef<BlockArgument> privateBlockArgs =
3860-
cast<omp::BlockArgOpenMPOpInterface>(opInst).getPrivateBlockArgs();
4056+
blockIface.getPrivateBlockArgs();
38614057

38624058
for (auto [privVar, privatizerNameAttr, privBlockArg] :
38634059
llvm::zip_equal(privateVars, *privateSyms, privateBlockArgs)) {
@@ -3936,13 +4132,30 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
39364132
allocaIP, codeGenIP);
39374133
};
39384134

3939-
// TODO: Populate default and runtime attributes based on the construct and
3940-
// clauses.
4135+
llvm::SmallVector<llvm::Value *, 4> kernelInput;
39414136
llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs = {
39424137
/*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
4138+
initTargetDefaultAttrs(targetOp, defaultAttrs, isTargetDevice);
4139+
4140+
// Collect host-evaluated values needed to properly launch the kernel from the
4141+
// host.
39434142
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
4143+
if (!isTargetDevice)
4144+
initTargetRuntimeAttrs(builder, moduleTranslation, targetOp, runtimeAttrs);
4145+
4146+
// Pass host-evaluated values as parameters to the kernel / host fallback,
4147+
// except if they are constants. In any case, map the MLIR block argument to
4148+
// the corresponding LLVM values.
4149+
SmallVector<Value> hostEvalVars = targetOp.getHostEvalVars();
4150+
ArrayRef<BlockArgument> hostEvalBlockArgs = blockIface.getHostEvalBlockArgs();
4151+
for (auto [arg, var] : llvm::zip_equal(hostEvalBlockArgs, hostEvalVars)) {
4152+
llvm::Value *value = moduleTranslation.lookupValue(var);
4153+
moduleTranslation.mapValue(arg, value);
4154+
4155+
if (!llvm::isa<llvm::Constant>(value))
4156+
kernelInput.push_back(value);
4157+
}
39444158

3945-
llvm::SmallVector<llvm::Value *, 4> kernelInput;
39464159
for (size_t i = 0; i < mapVars.size(); ++i) {
39474160
// declare target arguments are not passed to kernels as arguments
39484161
// TODO: We currently do not handle cases where a member is explicitly

mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
22

3-
module attributes {omp.is_target_device = true} {
3+
module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
44
llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
55
%0 = llvm.mlir.addressof @_QFEi : !llvm.ptr
66
%1 = llvm.mlir.addressof @_QFEsp : !llvm.ptr
@@ -23,7 +23,7 @@ module attributes {omp.is_target_device = true} {
2323
}
2424
}
2525

26-
// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_l{{.*}}(ptr %[[DYN_PTR:.*]], ptr %[[ARG_BYREF:.*]], ptr %[[ARG_BYCOPY:.*]]) {
26+
// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_l{{.*}}(ptr %[[DYN_PTR:.*]], ptr %[[ARG_BYREF:.*]], ptr %[[ARG_BYCOPY:.*]]) #{{[0-9]+}} {
2727

2828
// CHECK: entry:
2929
// CHECK: %[[ALLOCA_BYREF:.*]] = alloca ptr, align 8

mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
// constant sized) allocations performs its task reasonably in these
1111
// scenarios.
1212

13-
module attributes {omp.is_target_device = true} {
13+
module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
1414
llvm.func @_QQmain() attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
1515
%1 = llvm.mlir.constant(1 : i64) : i64
1616
%2 = llvm.alloca %1 x !llvm.struct<(ptr)> : (i64) -> !llvm.ptr
@@ -33,7 +33,7 @@ module attributes {omp.is_target_device = true} {
3333
llvm.func @_ExternalCall(!llvm.ptr, !llvm.ptr) -> !llvm.struct<()>
3434
}
3535

36-
// CHECK: define weak_odr protected void @{{.*}}QQmain_l{{.*}}({{.*}}, {{.*}}) {
36+
// CHECK: define weak_odr protected amdgpu_kernel void @{{.*}}QQmain_l{{.*}}({{.*}}, {{.*}}) #{{[0-9]+}} {
3737
// CHECK-NEXT: entry:
3838
// CHECK-NEXT: %[[MOVED_ALLOCA1:.*]] = alloca { ptr }, align 8
3939
// CHECK-NEXT: %[[MOVED_ALLOCA2:.*]] = alloca i32, i64 1, align 4

0 commit comments

Comments
 (0)