-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VectorCombine] Add intrinsics handling to shuffleToIdentity #91000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VectorCombine] Add intrinsics handling to shuffleToIdentity #91000
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a few minors
@@ -1729,7 +1729,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { | |||
return false; | |||
|
|||
// Look for an identity value. | |||
if (Item[0].second == 0 && Item[0].first->getType() == Ty && | |||
if (Item[0].second == 0 && | |||
cast<FixedVectorType>(Item[0].first->getType())->getNumElements() == |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have any cases where this might not be FixedVectorType?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My understanding is that we should always remain as Vectors, and nothing will become scalable considering we are looking at shuffles and lanewise instructions.
@@ -1770,6 +1772,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { | |||
Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 1)); | |||
} else if (isa<UnaryOperator>(Item[0].first)) { | |||
Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0)); | |||
} else if (auto *II = dyn_cast<IntrinsicInst>(Item[0].first); | |||
II && isTriviallyVectorizable(II->getIntrinsicID())) { | |||
for (unsigned O = 0; O < II->getNumOperands() - 1; O++) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for (unsigned O = 0, E = II->getNumOperands() - 1; O < E; O++) {
Also, why O? I've always avoided it due to O/0/o confusion, but that's me :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point :) It was just shorthand for the operation number and I perhaps made it too short. I'll switch it over to use Op.
This is probably the most involved addition, as it tries to make use of isTriviallyVectorizable with isVectorIntrinsicWithScalarOpAtArg to handle a number of different intrinsics that are all lane-wise. Additional tests have been added for some of the different intrinsics from isVectorIntrinsicWithScalarOpAtArg / isVectorIntrinsicWithOverloadTypeAtArg.
3d827ff
to
d4c0b23
Compare
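@llvm/pr-subscribers-llvm-transforms Author: David Green (davemgreen) Changes: This is probably the most involved addition, as it tries to make use of isTriviallyVectorizable with isVectorIntrinsicWithScalarOpAtArg to handle a number of different intrinsics that are all lane-wise. Additional tests have been added for some of the different intrinsics from isVectorIntrinsicWithScalarOpAtArg / isVectorIntrinsicWithOverloadTypeAtArg. Full diff: https://github.com/llvm/llvm-project/pull/91000.diff 2 Files Affected: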
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8573a8adf53b..9d43fb4ab607 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1729,7 +1729,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
return false;
// Look for an identity value.
- if (Item[0].second == 0 && Item[0].first->getType() == Ty &&
+ if (Item[0].second == 0 &&
+ cast<FixedVectorType>(Item[0].first->getType())->getNumElements() ==
+ Ty->getNumElements() &&
all_of(drop_begin(enumerate(Item)), [&](const auto &E) {
return !E.value().first || (E.value().first == Item[0].first &&
E.value().second == (int)E.index());
@@ -1773,6 +1775,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 1));
} else if (isa<UnaryOperator>(Item[0].first)) {
Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0));
+ } else if (auto *II = dyn_cast<IntrinsicInst>(Item[0].first);
+ II && isTriviallyVectorizable(II->getIntrinsicID())) {
+ for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
+ if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
+ if (!all_of(drop_begin(Item), [&](InstLane &IL) {
+ return !IL.first ||
+ (cast<Instruction>(IL.first)->getOperand(Op) ==
+ cast<Instruction>(Item[0].first)->getOperand(Op));
+ }))
+ return false;
+ continue;
+ }
+ Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, Op));
+ }
} else {
return false;
}
@@ -1799,13 +1815,24 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
}
auto *I = cast<Instruction>(Item[0].first);
- SmallVector<Value *> Ops(I->getNumOperands());
- for (unsigned Idx = 0, E = I->getNumOperands(); Idx < E; Idx++)
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
+ SmallVector<Value *> Ops(NumOps);
+ for (unsigned Idx = 0; Idx < NumOps; Idx++) {
+ if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) {
+ Ops[Idx] = II->getOperand(Idx);
+ continue;
+ }
Ops[Idx] = Generate(GenerateInstLaneVectorFromOperand(Item, Idx));
+ }
Builder.SetInsertPoint(I);
+ Type *DstTy = FixedVectorType::get(I->getType()->getScalarType(),
+ Ty->getNumElements());
if (auto BI = dyn_cast<BinaryOperator>(I))
return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
Ops[0], Ops[1]);
+ if (II)
+ return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
assert(isa<UnaryInstruction>(I) &&
"Unexpected instruction type in Generate");
return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 0fb634fce135..b58f92d70936 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -102,11 +102,7 @@ define <8 x half> @fneg(<8 x half> %a, <8 x half> %b) {
define <8 x i8> @abs(<8 x i8> %a) {
; CHECK-LABEL: @abs(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AT]], i1 false)
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AB]], i1 false)
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[A:%.*]], i1 false)
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -119,11 +115,7 @@ define <8 x i8> @abs(<8 x i8> %a) {
define <8 x half> @powi(<8 x half> %a) {
; CHECK-LABEL: @powi(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> [[AT]], i32 10)
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> [[AB]], i32 10)
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = call <8 x half> @llvm.powi.v8f16.i32(<8 x half> [[A:%.*]], i32 10)
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -171,11 +163,7 @@ declare <4 x half> @othercall(<4 x half>)
define <8 x i32> @lrint(<8 x half> %a) {
; CHECK-LABEL: @lrint(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> [[AT]])
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> [[AB]])
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABT]], <4 x i32> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f16(<8 x half> [[A:%.*]])
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -427,15 +415,7 @@ define <8 x i8> @icmpsel(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: @fma(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[CB:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[CT:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AB]], <4 x half> [[BB]], <4 x half> [[CB]])
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AT]], <4 x half> [[BT]], <4 x half> [[CT]])
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -487,19 +467,10 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: @intrinsics_minmax(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT1:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB1:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]])
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]])
+; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]])
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -520,19 +491,10 @@ define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
define <8 x i8> @intrinsics_addsat(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: @intrinsics_addsat(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[ABT3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]])
-; CHECK-NEXT: [[ABB3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]])
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]])
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]])
+; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]])
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
This is probably the most involved addition, as it tries to make use of
isTriviallyVectorizable with isVectorIntrinsicWithScalarOpAtArg to handle a
number of different intrinsics that are all lane-wise. Additional tests have
been added for some of the different intrinsics from
isVectorIntrinsicWithScalarOpAtArg / isVectorIntrinsicWithOverloadTypeAtArg.