Skip to content

Commit 3bb969f

Browse files
authored
[flang] Inline hlfir.matmul[_transpose]. (#122821)
Inlining `hlfir.matmul` as `hlfir.eval_in_mem` does not allow getting rid of a temporary array in many cases, but it may still be much better, allowing us to: * Get rid of any overhead related to calling runtime MATMUL (such as descriptors creation). * Use CPU-specific vectorization cost model for matmul loops, which Fortran runtime cannot currently do. * Optimize matmul of known-size arrays by complete unrolling. One of the drawbacks of `hlfir.eval_in_mem` inlining is that the ops inside it with store memory effects block the current MLIR CSE, so I decided to run this inlining late in the pipeline. There is a source comment explaining the CSE issue in more detail. Straightforward inlining of `hlfir.matmul` as an `hlfir.elemental` is not good for performance, and I got performance regressions with it compared to the Fortran runtime implementation. I put it under an engineering option for experiments. At the same time, inlining `hlfir.matmul_transpose` as `hlfir.elemental` seems to be a good approach, e.g. it allows getting rid of a temporary array in cases like: `A(:)=B(:)+MATMUL(TRANSPOSE(C(:,:)),D(:))`. This patch improves performance of galgel and tonto a little bit.
1 parent 2bfa7bc commit 3bb969f

File tree

10 files changed

+1183
-3
lines changed

10 files changed

+1183
-3
lines changed

flang/include/flang/Optimizer/Builder/FIRBuilder.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,15 @@ elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
804804
/// Get the address space which should be used for allocas
805805
uint64_t getAllocaAddressSpace(mlir::DataLayout *dataLayout);
806806

807+
/// The two vectors of MLIR values have the following property:
808+
/// \p extents1[i] must have the same value as \p extents2[i]
809+
/// The function returns a new vector of MLIR values that preserves
810+
/// the same property vs \p extents1 and \p extents2, but allows
811+
/// more optimizations. For example, if extents1[j] is a known constant,
812+
/// and extents2[j] is not, then result[j] is the MLIR value extents1[j].
813+
llvm::SmallVector<mlir::Value> deduceOptimalExtents(mlir::ValueRange extents1,
814+
mlir::ValueRange extents2);
815+
807816
} // namespace fir::factory
808817

809818
#endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H

flang/include/flang/Optimizer/Builder/HLFIRTools.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,11 @@ genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder,
508508
hlfir::Entity source, mlir::Type toType,
509509
bool preserveLowerBounds);
510510

511+
/// A shortcut for loadTrivialScalar(getElementAt()),
512+
/// which designates and loads an element of an array.
513+
Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder,
514+
Entity entity, mlir::ValueRange oneBasedIndices);
515+
511516
} // namespace hlfir
512517

513518
#endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H

flang/include/flang/Optimizer/HLFIR/Passes.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,17 @@ def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::ml
4343

4444
def SimplifyHLFIRIntrinsics : Pass<"simplify-hlfir-intrinsics"> {
4545
let summary = "Simplify HLFIR intrinsic operations that don't need to result in runtime calls";
46+
let options = [Option<"allowNewSideEffects", "allow-new-side-effects", "bool",
47+
/*default=*/"false",
48+
"If enabled, then the HLFIR operations simplification "
49+
"may introduce operations with side effects. "
50+
"For example, hlfir.matmul may be inlined as "
51+
"and hlfir.eval_in_mem with hlfir.assign inside it."
52+
"The hlfir.assign has a write effect on the memory "
53+
"argument of hlfir.eval_in_mem, which may block "
54+
"some existing MLIR transformations (e.g. CSE) "
55+
"that otherwise would have been possible across "
56+
"the hlfir.matmul.">];
4657
}
4758

4859
def InlineElementals : Pass<"inline-elementals"> {

flang/lib/Optimizer/Builder/FIRBuilder.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1740,3 +1740,17 @@ uint64_t fir::factory::getAllocaAddressSpace(mlir::DataLayout *dataLayout) {
17401740
return mlir::cast<mlir::IntegerAttr>(addrSpace).getUInt();
17411741
return 0;
17421742
}
1743+
1744+
llvm::SmallVector<mlir::Value>
1745+
fir::factory::deduceOptimalExtents(mlir::ValueRange extents1,
1746+
mlir::ValueRange extents2) {
1747+
llvm::SmallVector<mlir::Value> extents;
1748+
extents.reserve(extents1.size());
1749+
for (auto [extent1, extent2] : llvm::zip(extents1, extents2)) {
1750+
if (!fir::getIntIfConstant(extent1) && fir::getIntIfConstant(extent2))
1751+
extents.push_back(extent2);
1752+
else
1753+
extents.push_back(extent1);
1754+
}
1755+
return extents;
1756+
}

flang/lib/Optimizer/Builder/HLFIRTools.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,10 @@ llvm::SmallVector<mlir::Value> hlfir::genLoopNestWithReductions(
939939
doLoop = builder.create<fir::DoLoopOp>(loc, one, ub, one, isUnordered,
940940
/*finalCountValue=*/false,
941941
parentLoop.getRegionIterArgs());
942-
// Return the results of the child loop from its parent loop.
943-
builder.create<fir::ResultOp>(loc, doLoop.getResults());
942+
if (!reductionInits.empty()) {
943+
// Return the results of the child loop from its parent loop.
944+
builder.create<fir::ResultOp>(loc, doLoop.getResults());
945+
}
944946
}
945947

946948
builder.setInsertionPointToStart(doLoop.getBody());
@@ -955,7 +957,8 @@ llvm::SmallVector<mlir::Value> hlfir::genLoopNestWithReductions(
955957
reductionValues =
956958
genBody(loc, builder, oneBasedIndices, parentLoop.getRegionIterArgs());
957959
builder.setInsertionPointToEnd(parentLoop.getBody());
958-
builder.create<fir::ResultOp>(loc, reductionValues);
960+
if (!reductionValues.empty())
961+
builder.create<fir::ResultOp>(loc, reductionValues);
959962
builder.setInsertionPointAfter(outerLoop);
960963
return outerLoop->getResults();
961964
}
@@ -1410,3 +1413,11 @@ void hlfir::computeEvaluateOpIn(mlir::Location loc, fir::FirOpBuilder &builder,
14101413
builder.clone(op, mapper);
14111414
return;
14121415
}
1416+
1417+
hlfir::Entity hlfir::loadElementAt(mlir::Location loc,
1418+
fir::FirOpBuilder &builder,
1419+
hlfir::Entity entity,
1420+
mlir::ValueRange oneBasedIndices) {
1421+
return loadTrivialScalar(loc, builder,
1422+
getElementAt(loc, builder, entity, oneBasedIndices));
1423+
}

0 commit comments

Comments
 (0)