-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP][REVEC] Expand getelementptr into vector form. #103704
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP][REVEC] Expand getelementptr into vector form. #103704
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Han-Kuan Chen (HanKuanChen) ChangesFull diff: https://github.com/llvm/llvm-project/pull/103704.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index feffd9ae3c99b7..66d3b6a0036f4c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13760,6 +13760,26 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (isa<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "FixedVectorType is not expected.");
+ // CreateMaskedGather expects VecTy and VecPtr have same size. We need
+ // to expand VecPtr if ScalarTy is a vector type.
+ unsigned ScalarTyNumElements =
+ cast<FixedVectorType>(ScalarTy)->getNumElements();
+ unsigned VecTyNumElements =
+ cast<FixedVectorType>(VecTy)->getNumElements();
+ SmallVector<Constant *> Indices(VecTyNumElements);
+ transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
+ return Builder.getInt64(I % ScalarTyNumElements);
+ });
+ unsigned VF =
+ cast<FixedVectorType>(VecPtr->getType())->getNumElements();
+ VecPtr = Builder.CreateGEP(
+ VecTy->getElementType(),
+ Builder.CreateShuffleVector(
+ VecPtr, createReplicatedMask(VecTyNumElements / VF, VF)),
+ ConstantVector::get(Indices));
+ }
// Use the minimum alignment of the gathered loads.
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
new file mode 100644
index 00000000000000..d822a24220df2c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
+
+define i32 @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[IF_END_I87:%.*]]
+; CHECK: if.end.i87:
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> poison, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP1]], <2 x i32> zeroinitializer, i64 2)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: switch i32 0, label [[SW_BB509_I:%.*]] [
+; CHECK-NEXT: i32 1, label [[SW_BB509_I]]
+; CHECK-NEXT: i32 0, label [[IF_THEN458_I:%.*]]
+; CHECK-NEXT: ]
+; CHECK: if.then458.i:
+; CHECK-NEXT: br label [[SW_BB509_I]]
+; CHECK: sw.bb509.i:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP0]], [[IF_THEN458_I]] ], [ [[TMP3]], [[IF_END_I87]] ], [ [[TMP3]], [[IF_END_I87]] ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %getelementptr0 = getelementptr i8, ptr null, i64 64036
+ %getelementptr1 = getelementptr i8, ptr null, i64 64064
+ br label %if.end.i87
+
+if.end.i87: ; preds = %entry
+ %0 = load <2 x i32>, ptr %getelementptr0, align 4
+ %1 = load <2 x i32>, ptr %getelementptr1, align 8
+ switch i32 0, label %sw.bb509.i [
+ i32 1, label %sw.bb509.i
+ i32 0, label %if.then458.i
+ ]
+
+if.then458.i: ; preds = %if.end.i87
+ br label %sw.bb509.i
+
+sw.bb509.i: ; preds = %if.then458.i, %if.end.i87, %if.end.i87
+ %4 = phi <2 x i32> [ %0, %if.then458.i ], [ %0, %if.end.i87 ], [ %0, %if.end.i87 ]
+ %5 = phi <2 x i32> [ %1, %if.then458.i ], [ zeroinitializer, %if.end.i87 ], [ zeroinitializer, %if.end.i87 ]
+ ret i32 0
+}
|
transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) { | ||
return Builder.getInt64(I % ScalarTyNumElements); | ||
}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we rereading the same data here several times? Can it be replaced with subvector insert instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Like this dea6a00?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I mean read the data once and then just copy it several times in IR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we can. Each time the ScalarTyNumElements is different.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why? It may vary between calls, but it is not modified here. Instead of re-reading the same data several times, better just read it once and then insert subvector(s)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, do you have the cost model for this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not represent it as 2 loads + insert?
This test has -slp-threshold=-100
. Basically, scalar version (loads + insert) cost is smaller than vector version (masked gather) cost.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, do you have the cost model for this change?
GEPs are calculated by TTI.getGEPCost
in getGEPCosts
.
TTI->getGatherScatterOpCost
correctly calculates the cost if ScalarTy is FixedVectorType.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not represent it as 2 loads + insert?
This test has
-slp-threshold=-100
. Basically, scalar version (loads + insert) cost is smaller than vector version (masked gather) cost.
By scalar
you mean small vector <2 x >
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes.
✅ With the latest revision this PR passed the C/C++ code formatter. |
63f1f37
to
dea6a00
Compare
This reverts commit dea6a00.
; CHECK-NEXT: br label [[IF_END_I87:%.*]] | ||
; CHECK: if.end.i87: | ||
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison) | ||
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> poison, i64 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why inserting poison here?
transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) { | ||
return Builder.getInt64(I % ScalarTyNumElements); | ||
}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not represent it as 2 loads + insert?
This test has
-slp-threshold=-100
. Basically, scalar version (loads + insert) cost is smaller than vector version (masked gather) cost.
By scalar
you mean small vector <2 x >
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
No description provided.