[MLIR][OpenMP] Host lowering of distribute-parallel-do/for #127819
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-mlir-openmp

Author: Sergio Afonso (skatrak)

Changes

This patch adds support for translating composite `omp.parallel` + `omp.distribute` + `omp.wsloop` loops to LLVM IR on the host. This is done by passing an updated `WorksharingLoopType` to the call to `applyWorkshareLoop` associated with the lowering of the `omp.wsloop` operation, so that `__kmpc_dist_for_static_init` is called at runtime in place of `__kmpc_for_static_init`.

Existing translation rules take care of creating a parallel region to hold the workshared and workdistributed loop.

Full diff: https://github.com/llvm/llvm-project/pull/127819.diff

3 Files Affected:

- mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
- mlir/test/Target/LLVMIR/openmp-llvm.mlir
- mlir/test/Target/LLVMIR/openmp-todo.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index c8221a9f9854a..7e8a9bdb5b133 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -260,10 +260,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
LogicalResult result = success();
llvm::TypeSwitch<Operation &>(op)
.Case([&](omp::DistributeOp op) {
- if (op.isComposite() &&
- isa_and_present<omp::WsloopOp>(op.getNestedWrapper()))
- result = op.emitError() << "not yet implemented: "
- "composite omp.distribute + omp.wsloop";
checkAllocate(op, result);
checkDistSchedule(op, result);
checkOrder(op, result);
@@ -1993,6 +1989,14 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
bool isSimd = wsloopOp.getScheduleSimd();
bool loopNeedsBarrier = !wsloopOp.getNowait();
+ // The only legal way for the direct parent to be omp.distribute is that this
+ // represents 'distribute parallel do'. Otherwise, this is a regular
+ // worksharing loop.
+ llvm::omp::WorksharingLoopType workshareLoopType =
+ llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())
+ ? llvm::omp::WorksharingLoopType::DistributeForStaticLoop
+ : llvm::omp::WorksharingLoopType::ForStaticLoop;
+
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::Expected<llvm::BasicBlock *> regionBlock = convertOmpOpRegions(
wsloopOp.getRegion(), "omp.wsloop.region", builder, moduleTranslation);
@@ -2008,7 +2012,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
- scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
+ scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
+ workshareLoopType);
if (failed(handleError(wsloopIP, opInst)))
return failure();
@@ -3792,6 +3797,12 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
return regionBlock.takeError();
builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());
+ // Skip applying a workshare loop below when translating 'distribute
+ // parallel do' (it's been already handled by this point while translating
+ // the nested omp.wsloop).
+ if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
+ return llvm::Error::success();
+
// TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
// Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
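To summarize the control flow introduced above, here is a hedged, self-contained C++ model of the two decisions (the enum values mirror the patch, but `OpKind` and the helper names are illustrative stand-ins, not the real MLIR API):

```cpp
#include <cstdio>

// Simplified stand-ins for the MLIR ops involved; the real code queries
// opInst.getParentOp() and distributeOp.getNestedWrapper().
enum class OpKind { Parallel, Distribute, Wsloop, Other };

enum class WorksharingLoopType { ForStaticLoop, DistributeForStaticLoop };

// Mirrors convertOmpWsloop: a wsloop whose direct parent is omp.distribute
// can only be the 'distribute parallel do' composite, so it selects the
// distribute-aware static loop lowering.
WorksharingLoopType selectLoopType(OpKind parent) {
  return parent == OpKind::Distribute
             ? WorksharingLoopType::DistributeForStaticLoop
             : WorksharingLoopType::ForStaticLoop;
}

// Mirrors convertOmpDistribute: when omp.distribute wraps an omp.wsloop, the
// workshare loop was already applied while translating the nested wsloop, so
// the distribute translation returns early instead of applying a second one.
bool distributeShouldSkip(OpKind nestedWrapper) {
  return nestedWrapper == OpKind::Wsloop;
}

int main() {
  std::printf("composite -> %s\n",
              selectLoopType(OpKind::Distribute) ==
                      WorksharingLoopType::DistributeForStaticLoop
                  ? "DistributeForStaticLoop"
                  : "ForStaticLoop");
  std::printf("skip distribute lowering: %d\n",
              distributeShouldSkip(OpKind::Wsloop));
  return 0;
}
```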
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index a5a490e527d79..d85b149c66811 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -3307,3 +3307,68 @@ llvm.func @distribute() {
// CHECK: store i64 1, ptr %[[STRIDE]]
// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
// CHECK: call void @__kmpc_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 92, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[STRIDE]], i64 1, i64 0)
+
+// -----
+
+llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
+ omp.parallel {
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ llvm.return
+}
+
+// CHECK-LABEL: define void @distribute_wsloop
+// CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]],
+
+// CHECK: define internal void @[[OUTLINED_PARALLEL]]({{.*}})
+// CHECK: %[[ARGS:.*]] = alloca { i32, i32, i32, ptr, ptr, ptr, ptr }
+// CHECK: %[[LASTITER_ALLOC:.*]] = alloca i32
+// CHECK: %[[LB_ALLOC:.*]] = alloca i32
+// CHECK: %[[UB_ALLOC:.*]] = alloca i32
+// CHECK: %[[STRIDE_ALLOC:.*]] = alloca i32
+// CHECK: %[[LB_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 3
+// CHECK: store ptr %[[LB_ALLOC]], ptr %[[LB_ARG]]
+// CHECK: %[[UB_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 4
+// CHECK: store ptr %[[UB_ALLOC]], ptr %[[UB_ARG]]
+// CHECK: %[[STRIDE_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 5
+// CHECK: store ptr %[[STRIDE_ALLOC]], ptr %[[STRIDE_ARG]]
+// CHECK: %[[LASTITER_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 6
+// CHECK: store ptr %[[LASTITER_ALLOC]], ptr %[[LASTITER_ARG]]
+// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]](ptr %[[ARGS]])
+
+// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]](ptr %[[ARGS_STRUCT:.*]])
+// CHECK: %[[LB_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 3
+// CHECK: %[[LB:.*]] = load ptr, ptr %[[LB_PTR]]
+// CHECK: %[[UB_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 4
+// CHECK: %[[UB:.*]] = load ptr, ptr %[[UB_PTR]]
+// CHECK: %[[STRIDE_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 5
+// CHECK: %[[STRIDE:.*]] = load ptr, ptr %[[STRIDE_PTR]]
+// CHECK: %[[LASTITER_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 6
+// CHECK: %[[LASTITER:.*]] = load ptr, ptr %[[LASTITER_PTR]]
+// CHECK: br label %[[DISTRIBUTE_BODY:.*]]
+
+// CHECK: [[DISTRIBUTE_BODY]]:
+// CHECK-NEXT: br label %[[DISTRIBUTE_REGION:.*]]
+
+// CHECK: [[DISTRIBUTE_REGION]]:
+// CHECK-NEXT: br label %[[WSLOOP_REGION:.*]]
+
+// CHECK: [[WSLOOP_REGION]]:
+// CHECK: %omp_loop.tripcount = select {{.*}}
+// CHECK-NEXT: br label %[[PREHEADER:.*]]
+
+// CHECK: [[PREHEADER]]:
+// CHECK: store i32 0, ptr %[[LB]]
+// CHECK: %[[TRIPCOUNT:.*]] = sub i32 %omp_loop.tripcount, 1
+// CHECK: store i32 %[[TRIPCOUNT]], ptr %[[UB]]
+// CHECK: store i32 1, ptr %[[STRIDE]]
+// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
+// CHECK: %[[DIST_UB:.*]] = alloca i32
+// CHECK: call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)
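Note the schedule-type constants: the plain `omp.distribute` test above passes 92 to `__kmpc_for_static_init`, while the composite test passes 34 (libomp names these `kmp_distribute_static` and `kmp_sch_static`, respectively) to the `dist` entry point. For reference, a hedged sketch of the two declarations, after libomp's kmp.h; the extra pointer receives the per-team distribute upper bound, the `%[[DIST_UB]]` alloca above:

```cpp
// Hedged sketch after libomp's kmp.h; treat exact typedefs as illustrative.
extern "C" {
typedef int kmp_int32;
struct ident_t; // source-location descriptor passed as the first argument

// Plain worksharing loop: computes this thread's [*plower, *pupper] chunk.
void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 schedtype, kmp_int32 *plastiter,
                              kmp_int32 *plower, kmp_int32 *pupper,
                              kmp_int32 *pstride, kmp_int32 incr,
                              kmp_int32 chunk);

// 'dist' variant used for composite distribute+wsloop: additionally reports
// the team's distribute upper bound through *pupperD.
void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid,
                                   kmp_int32 schedtype, kmp_int32 *plastiter,
                                   kmp_int32 *plower, kmp_int32 *pupper,
                                   kmp_int32 *pupperD, kmp_int32 *pstride,
                                   kmp_int32 incr, kmp_int32 chunk);
}
```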
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 71dbc061c3104..d1c745af9bff5 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -66,25 +66,6 @@ llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) {
// -----
-llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{LLVM Translation failed for operation: omp.parallel}}
- omp.parallel {
- // expected-error@below {{not yet implemented: composite omp.distribute + omp.wsloop}}
- // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
- omp.distribute {
- omp.wsloop {
- omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
- omp.yield
- }
- } {omp.composite}
- } {omp.composite}
- omp.terminator
- } {omp.composite}
- llvm.return
-}
-
-// -----
-
llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// expected-error@below {{not yet implemented: Unhandled clause allocate in omp.distribute operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
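With the TODO test removed, the composite construct lowers on the host end-to-end. For orientation, a hedged user-level analogue in C++ of the 'distribute parallel for' composite this enables (illustrative only; in both C++ and Fortran the construct nests under `teams`, which the MLIR test above omits):

```cpp
#include <cstdio>

int main() {
  int a[128];
  // Iterations are first distributed across teams, then workshared among
  // each team's threads. On the host, the static-schedule case bottoms out
  // in __kmpc_dist_for_static_init_*, as checked by the test above.
  #pragma omp teams distribute parallel for
  for (int i = 0; i < 128; ++i)
    a[i] = i;
  std::printf("a[127] = %d\n", a[127]);
  return 0;
}
```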
LGTM, thanks!
Branch force-pushed: cb7ae2d → dbe0d70 · 38ba269 → 33d5af4 · dbe0d70 → ba9ea8c · 33d5af4 → aad04fa · ba9ea8c → d164acf · 1172e9b → e61c797 · d164acf → 6bd8d01
This patch adds support for translating composite `omp.parallel` + `omp.distribute` + `omp.wsloop` loops to LLVM IR on the host. This is done by passing an updated `WorksharingLoopType` to the call to `applyWorkshareLoop` associated to the lowering of the `omp.wsloop` operation, so that `__kmpc_dist_for_static_init` is called at runtime in place of `__kmpc_for_static_init`. Existing translation rules take care of creating a parallel region to hold the workshared and workdistributed loop.
Branch force-pushed: e61c797 → ac8b967