Skip to content

[OpenMP][MLIR] Support LLVM translation for distribute with delayed privatization #131564

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 1 commit was merged on Mar 18, 2025.
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 62 additions & 25 deletions mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkAllocate(op, result);
checkDistSchedule(op, result);
checkOrder(op, result);
checkPrivate(op, result);
})
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
Expand Down Expand Up @@ -4188,6 +4187,38 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
// DistributeOp has only one region associated with it.
builder.restoreIP(codeGenIP);

// TODO This is a recurring pattern in almost all ops that need
// privatization. Try to abstract it in a shared util/interface.
MutableArrayRef<BlockArgument> privateBlockArgs =
cast<omp::BlockArgOpenMPOpInterface>(*distributeOp)
.getPrivateBlockArgs();
SmallVector<mlir::Value> mlirPrivateVars;
SmallVector<llvm::Value *> llvmPrivateVars;
SmallVector<omp::PrivateClauseOp> privateDecls;
mlirPrivateVars.reserve(privateBlockArgs.size());
llvmPrivateVars.reserve(privateBlockArgs.size());
collectPrivatizationDecls(distributeOp, privateDecls);

for (mlir::Value privateVar : distributeOp.getPrivateVars())
mlirPrivateVars.push_back(privateVar);

llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
builder, moduleTranslation, privateBlockArgs, privateDecls,
mlirPrivateVars, llvmPrivateVars, allocaIP);
if (handleError(afterAllocas, opInst).failed())
return llvm::make_error<PreviouslyReportedError>();

if (handleError(initPrivateVars(builder, moduleTranslation,
privateBlockArgs, privateDecls,
mlirPrivateVars, llvmPrivateVars),
opInst)
.failed())
return llvm::make_error<PreviouslyReportedError>();

if (failed(copyFirstPrivateVars(builder, moduleTranslation, mlirPrivateVars,
llvmPrivateVars, privateDecls)))
return llvm::make_error<PreviouslyReportedError>();

llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::Expected<llvm::BasicBlock *> regionBlock =
Expand All @@ -4200,31 +4231,37 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
// Skip applying a workshare loop below when translating 'distribute
// parallel do' (it's been already handled by this point while translating
// the nested omp.wsloop).
if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
return llvm::Error::success();
if (!isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper())) {
// TODO: Add support for clauses which are valid for DISTRIBUTE
// constructs. Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
bool isOrdered = false;
std::optional<omp::ScheduleModifier> scheduleMod;
bool isSimd = false;
llvm::omp::WorksharingLoopType workshareLoopType =
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
bool loopNeedsBarrier = false;
llvm::Value *chunk = nullptr;

llvm::CanonicalLoopInfo *loopInfo =
findCurrentLoopInfo(moduleTranslation);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);

if (!wsloopIP)
return wsloopIP.takeError();
}

if (failed(cleanupPrivateVars(builder, moduleTranslation,
distributeOp.getLoc(), llvmPrivateVars,
privateDecls)))
return llvm::make_error<PreviouslyReportedError>();

// TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
// Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
bool isOrdered = false;
std::optional<omp::ScheduleModifier> scheduleMod;
bool isSimd = false;
llvm::omp::WorksharingLoopType workshareLoopType =
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
bool loopNeedsBarrier = false;
llvm::Value *chunk = nullptr;

llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);

if (!wsloopIP)
return wsloopIP.takeError();
return llvm::Error::success();
};

Expand Down
106 changes: 106 additions & 0 deletions mlir/test/Target/LLVMIR/openmp-distribute-private.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Test code-gen for `omp.distribute` ops with delayed privatizers (i.e. using
// `omp.private` ops).

// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

// Privatizer for the i32 loop index `i` (plain `private`: no copy region,
// each task gets an uninitialized private copy).
omp.private {type = private} @_QFEi_private_i32 : i32
// Privatizer for the f32 variable `priv_val_dist` (plain `private`).
omp.private {type = private} @_QFEpriv_val_dist_private_f32 : f32

// `omp.distribute` nested inside `omp.teams`, with two delayed-privatized
// variables (`priv_val_dist` and the loop index `i`). The CHECK lines below
// expect both to be re-allocated privately inside the outlined distribute
// function rather than accessed through the host allocas.
llvm.func @_QQmain() {
%0 = llvm.mlir.constant(1 : i64) : i64
// Host (original) allocations of the variables being privatized.
%1 = llvm.alloca %0 x f32 {bindc_name = "priv_val_dist"} : (i64) -> !llvm.ptr
%3 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
%4 = llvm.mlir.constant(3.140000e+00 : f32) : f32
%5 = llvm.mlir.constant(1000 : i32) : i32
%6 = llvm.mlir.constant(1 : i32) : i32

omp.teams {
// Delayed privatization: each `@privatizer %host -> %blockArg` pair binds a
// host variable to a region block argument that stands for its private copy.
omp.distribute private(@_QFEpriv_val_dist_private_f32 %1 -> %arg0, @_QFEi_private_i32 %3 -> %arg1 : !llvm.ptr, !llvm.ptr) {
omp.loop_nest (%arg2) : i32 = (%6) to (%5) inclusive step (%6) {
// Stores go through the block args, i.e. the private copies.
llvm.store %arg2, %arg1 : i32, !llvm.ptr
llvm.store %4, %arg0 : f32, !llvm.ptr
omp.yield
}
}
omp.terminator
}

llvm.return
}

// CHECK-LABEL: define void @_QQmain() {
// CHECK: call void {{.*}} @__kmpc_fork_teams(ptr @{{.*}}, i32 0, ptr @[[TEAMS_FUNC:.*]])
// CHECK-NEXT: br label %teams.exit
// CHECK: }

// CHECK: define internal void @[[TEAMS_FUNC]]({{.*}}) {
// CHECK: call void @[[DIST_FUNC:.*]]()
// CHECK-NEXT: br label %distribute.exit
// CHECK: }

// CHECK: define internal void @[[DIST_FUNC]]() {
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4
// CHECK: %[[IV_ALLOC:.*]] = alloca i32, align 4

// CHECK: omp.loop_nest.region:
// CHECK-NEXT: store i32 %{{.*}}, ptr %[[IV_ALLOC]], align 4
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
// CHECK: }

// -----

// External free routine called from the `dealloc` region below; its call in
// the generated IR is what the test checks to prove cleanup runs.
llvm.func @foo_free(!llvm.ptr)

// `firstprivate` privatizer: the `copy` region initializes the private copy
// from the original value; the `dealloc` region releases the copy afterwards.
omp.private {type = firstprivate} @_QFEpriv_val_dist_firstprivate_f32 : f32 copy {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
// %arg0: pointer to the original variable; %arg1: pointer to the private copy.
%0 = llvm.load %arg0 : !llvm.ptr -> f32
llvm.store %0, %arg1 : f32, !llvm.ptr
omp.yield(%arg1 : !llvm.ptr)
} dealloc {
^bb0(%arg0: !llvm.ptr):
llvm.call @foo_free(%arg0) : (!llvm.ptr) -> ()
omp.yield
}

// Standalone `omp.distribute` (no enclosing teams) with one firstprivate
// variable; the CHECK lines below verify both the copy-in of the original
// value and the `foo_free` dealloc call after the region.
llvm.func @_QQmain() {
%0 = llvm.mlir.constant(1 : i64) : i64
// Host allocation of the original `priv_val_dist` variable.
%1 = llvm.alloca %0 x f32 {bindc_name = "priv_val_dist"} : (i64) -> !llvm.ptr
%4 = llvm.mlir.constant(3.140000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : i32) : i32
// %arg0 is the block argument standing for the firstprivate copy of %1.
omp.distribute private(@_QFEpriv_val_dist_firstprivate_f32 %1 -> %arg0 : !llvm.ptr) {
omp.loop_nest (%arg2) : i32 = (%6) to (%6) inclusive step (%6) {
llvm.store %4, %arg0 : f32, !llvm.ptr
omp.yield
}
}
llvm.return
}

// CHECK-LABEL: define void @_QQmain() {
// CHECK: %[[SHARED_VAR_ALLOC:.*]] = alloca float, i64 1, align 4
// CHECK: %[[SHARED_VAR_PTR:.*]] = getelementptr { ptr }, ptr %[[DIST_PARAM:.*]], i32 0, i32 0
// CHECK: store ptr %[[SHARED_VAR_ALLOC]], ptr %[[SHARED_VAR_PTR]], align 8
// CHECK: call void @[[DIST_FUNC:.*]](ptr %[[DIST_PARAM]])
// CHECK-NEXT: br label %distribute.exit
// CHECK: }

// CHECK: define internal void @[[DIST_FUNC]](ptr %[[DIST_ARG:.*]]) {
// CHECK: %[[SHARED_VAR_GEP:.*]] = getelementptr { ptr }, ptr %[[DIST_ARG]], i32 0, i32 0
// CHECK: %[[SHARED_VAR_PTR2:.*]] = load ptr, ptr %[[SHARED_VAR_GEP]], align 8
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4

// CHECK: omp.private.copy:
// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_PTR2]], align 4
// CHECK-NEXT: store float %[[SHARED_VAR_VAL]], ptr %[[PRIV_VAR_ALLOC]], align 4

// CHECK: omp_loop.after:
// CHECK-NEXT: br label %omp.region.cont

// CHECK: omp.region.cont:
// CHECK-NEXT: call void @foo_free(ptr %[[PRIV_VAR_ALLOC]])

// CHECK: omp.loop_nest.region:
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
// CHECK: }


15 changes: 0 additions & 15 deletions mlir/test/Target/LLVMIR/openmp-todo.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,6 @@ llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {

// -----

omp.private {type = private} @x.privatizer : !llvm.ptr

llvm.func @distribute_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// expected-error@below {{not yet implemented: Unhandled clause privatization in omp.distribute operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
omp.distribute private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
omp.yield
}
}
llvm.return
}

// -----

llvm.func @ordered_region_par_level_simd() {
// expected-error@below {{not yet implemented: Unhandled clause parallelization-level in omp.ordered.region operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.ordered.region}}
Expand Down