Skip to content

Commit 446899e

Browse files
authored
[MLIR][OpenMP] Host lowering of distribute-parallel-do/for (#127819)
This patch adds support for translating composite `omp.parallel` + `omp.distribute` + `omp.wsloop` loops to LLVM IR on the host. This is done by passing an updated `WorksharingLoopType` to the `applyWorkshareLoop` call associated with the lowering of the `omp.wsloop` operation, so that `__kmpc_dist_for_static_init` is called at runtime in place of `__kmpc_for_static_init`. Existing translation rules take care of creating a parallel region to hold the workshared and workdistributed loop.
1 parent 9fc2f78 commit 446899e

File tree

3 files changed

+67
-24
lines changed

3 files changed

+67
-24
lines changed

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,10 +257,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
257257
LogicalResult result = success();
258258
llvm::TypeSwitch<Operation &>(op)
259259
.Case([&](omp::DistributeOp op) {
260-
if (op.isComposite() &&
261-
isa_and_present<omp::WsloopOp>(op.getNestedWrapper()))
262-
result = op.emitError() << "not yet implemented: "
263-
"composite omp.distribute + omp.wsloop";
264260
checkAllocate(op, result);
265261
checkDistSchedule(op, result);
266262
checkOrder(op, result);
@@ -1990,6 +1986,14 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
19901986
bool isSimd = wsloopOp.getScheduleSimd();
19911987
bool loopNeedsBarrier = !wsloopOp.getNowait();
19921988

1989+
// The only legal way for the direct parent to be omp.distribute is that this
1990+
// represents 'distribute parallel do'. Otherwise, this is a regular
1991+
// worksharing loop.
1992+
llvm::omp::WorksharingLoopType workshareLoopType =
1993+
llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())
1994+
? llvm::omp::WorksharingLoopType::DistributeForStaticLoop
1995+
: llvm::omp::WorksharingLoopType::ForStaticLoop;
1996+
19931997
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
19941998
llvm::Expected<llvm::BasicBlock *> regionBlock = convertOmpOpRegions(
19951999
wsloopOp.getRegion(), "omp.wsloop.region", builder, moduleTranslation);
@@ -2005,7 +2009,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
20052009
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
20062010
convertToScheduleKind(schedule), chunk, isSimd,
20072011
scheduleMod == omp::ScheduleModifier::monotonic,
2008-
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
2012+
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
2013+
workshareLoopType);
20092014

20102015
if (failed(handleError(wsloopIP, opInst)))
20112016
return failure();
@@ -3896,6 +3901,12 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
38963901
return regionBlock.takeError();
38973902
builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());
38983903

3904+
// Skip applying a workshare loop below when translating 'distribute
3905+
// parallel do' (it's been already handled by this point while translating
3906+
// the nested omp.wsloop).
3907+
if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
3908+
return llvm::Error::success();
3909+
38993910
// TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
39003911
// Static schedule is the default.
39013912
auto schedule = omp::ClauseScheduleKind::Static;

mlir/test/Target/LLVMIR/openmp-llvm.mlir

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3307,3 +3307,54 @@ llvm.func @distribute() {
33073307
// CHECK: store i64 1, ptr %[[STRIDE]]
33083308
// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
33093309
// CHECK: call void @__kmpc_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 92, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[STRIDE]], i64 1, i64 0)
3310+
3311+
// -----
3312+
3313+
llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
3314+
omp.parallel {
3315+
omp.distribute {
3316+
omp.wsloop {
3317+
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
3318+
omp.yield
3319+
}
3320+
} {omp.composite}
3321+
} {omp.composite}
3322+
omp.terminator
3323+
} {omp.composite}
3324+
llvm.return
3325+
}
3326+
3327+
// CHECK-LABEL: define void @distribute_wsloop
3328+
// CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]],
3329+
3330+
// CHECK: define internal void @[[OUTLINED_PARALLEL]]
3331+
// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]]({{.*}})
3332+
3333+
// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]]
3334+
// CHECK: %[[LASTITER:.*]] = alloca i32
3335+
// CHECK: %[[LB:.*]] = alloca i32
3336+
// CHECK: %[[UB:.*]] = alloca i32
3337+
// CHECK: %[[STRIDE:.*]] = alloca i32
3338+
// CHECK: br label %[[AFTER_ALLOCA:.*]]
3339+
3340+
// CHECK: [[AFTER_ALLOCA]]:
3341+
// CHECK: br label %[[DISTRIBUTE_BODY:.*]]
3342+
3343+
// CHECK: [[DISTRIBUTE_BODY]]:
3344+
// CHECK-NEXT: br label %[[DISTRIBUTE_REGION:.*]]
3345+
3346+
// CHECK: [[DISTRIBUTE_REGION]]:
3347+
// CHECK-NEXT: br label %[[WSLOOP_REGION:.*]]
3348+
3349+
// CHECK: [[WSLOOP_REGION]]:
3350+
// CHECK: %omp_loop.tripcount = select {{.*}}
3351+
// CHECK-NEXT: br label %[[PREHEADER:.*]]
3352+
3353+
// CHECK: [[PREHEADER]]:
3354+
// CHECK: store i32 0, ptr %[[LB]]
3355+
// CHECK: %[[TRIPCOUNT:.*]] = sub i32 %omp_loop.tripcount, 1
3356+
// CHECK: store i32 %[[TRIPCOUNT]], ptr %[[UB]]
3357+
// CHECK: store i32 1, ptr %[[STRIDE]]
3358+
// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
3359+
// CHECK: %[[DIST_UB:.*]] = alloca i32
3360+
// CHECK: call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)

mlir/test/Target/LLVMIR/openmp-todo.mlir

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -66,25 +66,6 @@ llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) {
6666

6767
// -----
6868

69-
llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
70-
// expected-error@below {{LLVM Translation failed for operation: omp.parallel}}
71-
omp.parallel {
72-
// expected-error@below {{not yet implemented: composite omp.distribute + omp.wsloop}}
73-
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
74-
omp.distribute {
75-
omp.wsloop {
76-
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
77-
omp.yield
78-
}
79-
} {omp.composite}
80-
} {omp.composite}
81-
omp.terminator
82-
} {omp.composite}
83-
llvm.return
84-
}
85-
86-
// -----
87-
8869
llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
8970
// expected-error@below {{not yet implemented: Unhandled clause allocate in omp.distribute operation}}
9071
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}

0 commit comments

Comments
 (0)