-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[mlir][LLVMIR][OpenMP] Move reduction allocas before stores #105683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][LLVMIR][OpenMP] Move reduction allocas before stores #105683
Conversation
Delay implicit reduction var stores until after all allocas. This fixes a couple of cases in existing unit tests.
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-flang-fir-hlfir Author: Tom Eccles (tblah) ChangesDelay implicit reduction var stores until after all allocas. This fixes a couple of cases in existing unit tests. Follow up to #102524 Full diff: https://github.com/llvm/llvm-project/pull/105683.diff 3 Files Affected:
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
index 1457be05ca1025..262075ec9b25d0 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
@@ -26,10 +26,10 @@ end subroutine proc
!CHECK: store i32 %[[TID]], ptr %[[TID_LOCAL]]
!CHECK: %[[F_priv:.*]] = alloca ptr
!CHECK: %[[I_priv:.*]] = alloca i32
-!CHECK: store ptr %{{.*}}, ptr %[[F_priv]]
!CHECK: omp.reduction.init:
-!CHECK: store i32 0, ptr %[[I_priv]]
+!CHECK: store ptr %{{.*}}, ptr %[[F_priv]]
+!CHECK: store i32 0, ptr %[[I_priv]]
!CHECK: omp.par.region:
!CHECK: br label %[[MALLOC_BB:.*]]
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6d14d77c440e67..b64bd7d45a7705 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -593,22 +593,23 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
}
/// Allocate space for privatized reduction variables.
+/// `deferredStores` contains information to create store operations which needs
+/// to be inserted after all allocas
template <typename T>
-static LogicalResult
-allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
- llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
- SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
- SmallVectorImpl<llvm::Value *> &privateReductionVariables,
- DenseMap<Value, llvm::Value *> &reductionVariableMap,
- llvm::ArrayRef<bool> isByRefs) {
+static LogicalResult allocReductionVars(
+ T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+ SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+ SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+ DenseMap<Value, llvm::Value *> &reductionVariableMap,
+ SmallVectorImpl<std::pair<llvm::Value *, llvm::Value *>> &deferredStores,
+ llvm::ArrayRef<bool> isByRefs) {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
// delay creating stores until after all allocas
- SmallVector<std::pair<llvm::Value *, llvm::Value *>> storesToCreate;
- storesToCreate.reserve(loop.getNumReductionVars());
+ deferredStores.reserve(loop.getNumReductionVars());
for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
Region &allocRegion = reductionDecls[i].getAllocRegion();
@@ -628,7 +629,7 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- storesToCreate.emplace_back(phis[0], var);
+ deferredStores.emplace_back(phis[0], var);
privateReductionVariables[i] = var;
moduleTranslation.mapValue(reductionArgs[i], phis[0]);
@@ -644,10 +645,6 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
}
}
- // TODO: further delay this so it doesn't come in the entry block at all
- for (auto [data, addr] : storesToCreate)
- builder.CreateStore(data, addr);
-
return success();
}
@@ -819,12 +816,19 @@ static LogicalResult allocAndInitializeReductionVars(
if (op.getNumReductionVars() == 0)
return success();
+ SmallVector<std::pair<llvm::Value *, llvm::Value *>> deferredStores;
+
if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation,
allocaIP, reductionDecls,
privateReductionVariables, reductionVariableMap,
- isByRef)))
+ deferredStores, isByRef)))
return failure();
+ // store result of the alloc region to the allocated pointer to the real
+ // reduction variable
+ for (auto [data, addr] : deferredStores)
+ builder.CreateStore(data, addr);
+
// Before the loop, store the initial values of reductions into reduction
// variables. Although this could be done after allocas, we don't want to mess
// up with the alloca insertion point.
@@ -1359,6 +1363,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
collectReductionDecls(opInst, reductionDecls);
SmallVector<llvm::Value *> privateReductionVariables(
opInst.getNumReductionVars());
+ SmallVector<std::pair<llvm::Value *, llvm::Value *>> deferredStores;
auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
// Allocate reduction vars
@@ -1373,10 +1378,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
InsertPointTy(allocaIP.getBlock(),
allocaIP.getBlock()->getTerminator()->getIterator());
- if (failed(allocReductionVars(opInst, reductionArgs, builder,
- moduleTranslation, allocaIP, reductionDecls,
- privateReductionVariables,
- reductionVariableMap, isByRef)))
+ if (failed(allocReductionVars(
+ opInst, reductionArgs, builder, moduleTranslation, allocaIP,
+ reductionDecls, privateReductionVariables, reductionVariableMap,
+ deferredStores, isByRef)))
bodyGenStatus = failure();
// Initialize reduction vars
@@ -1401,6 +1406,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
+ // insert stores deferred until after all allocas
+ // these store the results of the alloc region into the allocation for the
+ // pointer to the reduction variable
+ for (auto [data, addr] : deferredStores)
+ builder.CreateStore(data, addr);
+
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
index da6f9430046123..2d8a13ccd2a1f5 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
@@ -89,7 +89,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
// CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4
// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
// CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8
// CHECK: br label %[[VAL_15:.*]]
// CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]]
@@ -98,6 +97,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
// CHECK: br label %[[VAL_18:.*]]
// CHECK: omp.par.region1: ; preds = %[[VAL_17]]
// CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
// CHECK: br label %[[VAL_22:.*]]
// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]]
// CHECK: store i32 0, ptr %[[VAL_7]], align 4
|
@llvm/pr-subscribers-mlir Author: Tom Eccles (tblah) ChangesDelay implicit reduction var stores until after all allocas. This fixes a couple of cases in existing unit tests. Follow up to #102524 Full diff: https://github.com/llvm/llvm-project/pull/105683.diff 3 Files Affected:
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
index 1457be05ca1025..262075ec9b25d0 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90
@@ -26,10 +26,10 @@ end subroutine proc
!CHECK: store i32 %[[TID]], ptr %[[TID_LOCAL]]
!CHECK: %[[F_priv:.*]] = alloca ptr
!CHECK: %[[I_priv:.*]] = alloca i32
-!CHECK: store ptr %{{.*}}, ptr %[[F_priv]]
!CHECK: omp.reduction.init:
-!CHECK: store i32 0, ptr %[[I_priv]]
+!CHECK: store ptr %{{.*}}, ptr %[[F_priv]]
+!CHECK: store i32 0, ptr %[[I_priv]]
!CHECK: omp.par.region:
!CHECK: br label %[[MALLOC_BB:.*]]
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6d14d77c440e67..b64bd7d45a7705 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -593,22 +593,23 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
}
/// Allocate space for privatized reduction variables.
+/// `deferredStores` contains information to create store operations which needs
+/// to be inserted after all allocas
template <typename T>
-static LogicalResult
-allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
- llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
- SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
- SmallVectorImpl<llvm::Value *> &privateReductionVariables,
- DenseMap<Value, llvm::Value *> &reductionVariableMap,
- llvm::ArrayRef<bool> isByRefs) {
+static LogicalResult allocReductionVars(
+ T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+ SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+ SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+ DenseMap<Value, llvm::Value *> &reductionVariableMap,
+ SmallVectorImpl<std::pair<llvm::Value *, llvm::Value *>> &deferredStores,
+ llvm::ArrayRef<bool> isByRefs) {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
// delay creating stores until after all allocas
- SmallVector<std::pair<llvm::Value *, llvm::Value *>> storesToCreate;
- storesToCreate.reserve(loop.getNumReductionVars());
+ deferredStores.reserve(loop.getNumReductionVars());
for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
Region &allocRegion = reductionDecls[i].getAllocRegion();
@@ -628,7 +629,7 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- storesToCreate.emplace_back(phis[0], var);
+ deferredStores.emplace_back(phis[0], var);
privateReductionVariables[i] = var;
moduleTranslation.mapValue(reductionArgs[i], phis[0]);
@@ -644,10 +645,6 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
}
}
- // TODO: further delay this so it doesn't come in the entry block at all
- for (auto [data, addr] : storesToCreate)
- builder.CreateStore(data, addr);
-
return success();
}
@@ -819,12 +816,19 @@ static LogicalResult allocAndInitializeReductionVars(
if (op.getNumReductionVars() == 0)
return success();
+ SmallVector<std::pair<llvm::Value *, llvm::Value *>> deferredStores;
+
if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation,
allocaIP, reductionDecls,
privateReductionVariables, reductionVariableMap,
- isByRef)))
+ deferredStores, isByRef)))
return failure();
+ // store result of the alloc region to the allocated pointer to the real
+ // reduction variable
+ for (auto [data, addr] : deferredStores)
+ builder.CreateStore(data, addr);
+
// Before the loop, store the initial values of reductions into reduction
// variables. Although this could be done after allocas, we don't want to mess
// up with the alloca insertion point.
@@ -1359,6 +1363,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
collectReductionDecls(opInst, reductionDecls);
SmallVector<llvm::Value *> privateReductionVariables(
opInst.getNumReductionVars());
+ SmallVector<std::pair<llvm::Value *, llvm::Value *>> deferredStores;
auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
// Allocate reduction vars
@@ -1373,10 +1378,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
InsertPointTy(allocaIP.getBlock(),
allocaIP.getBlock()->getTerminator()->getIterator());
- if (failed(allocReductionVars(opInst, reductionArgs, builder,
- moduleTranslation, allocaIP, reductionDecls,
- privateReductionVariables,
- reductionVariableMap, isByRef)))
+ if (failed(allocReductionVars(
+ opInst, reductionArgs, builder, moduleTranslation, allocaIP,
+ reductionDecls, privateReductionVariables, reductionVariableMap,
+ deferredStores, isByRef)))
bodyGenStatus = failure();
// Initialize reduction vars
@@ -1401,6 +1406,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
+ // insert stores deferred until after all allocas
+ // these store the results of the alloc region into the allocation for the
+ // pointer to the reduction variable
+ for (auto [data, addr] : deferredStores)
+ builder.CreateStore(data, addr);
+
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
index da6f9430046123..2d8a13ccd2a1f5 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
@@ -89,7 +89,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
// CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4
// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
// CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8
// CHECK: br label %[[VAL_15:.*]]
// CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]]
@@ -98,6 +97,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
// CHECK: br label %[[VAL_18:.*]]
// CHECK: omp.par.region1: ; preds = %[[VAL_17]]
// CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
// CHECK: br label %[[VAL_22:.*]]
// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]]
// CHECK: store i32 0, ptr %[[VAL_7]], align 4
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for this work, LGTM. Just a small suggestion from me.
SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls, | ||
SmallVectorImpl<llvm::Value *> &privateReductionVariables, | ||
DenseMap<Value, llvm::Value *> &reductionVariableMap, | ||
SmallVectorImpl<std::pair<llvm::Value *, llvm::Value *>> &deferredStores, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Feel free to disagree, but I think it would help readability to define a simple struct to hold elements of this vector, so that we can refer to the value
and address
of the deferred store rather than some undocumented pair element. Mainly because now it crosses function boundaries, so keeping track of what's what is more difficult.
Delay implicit reduction var stores until after all allocas. This fixes a couple of cases in existing unit tests.
Follow up to #102524