-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[InstCombine] Optimistically allow multiple shufflevector uses in foldOpPhi #114278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Matthias Braun (MatzeB) ChangesWe would like to optimize situations of the form that happen after loop vectorization+SROA:
where the interleave and de-interleave shuffle operations cancel each other out. This extends Full diff: https://github.com/llvm/llvm-project/pull/114278.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 2a54390c0f1882..9b2074e481f54f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1773,17 +1773,33 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
if (NumPHIValues == 0)
return nullptr;
- // We normally only transform phis with a single use. However, if a PHI has
- // multiple uses and they are all the same operation, we can fold *all* of the
- // uses into the PHI.
+ // We normally only transform phis with a single use.
+ bool AllUsesIdentical = false;
+ bool MultipleUses = false;
if (!PN->hasOneUse()) {
- // Walk the use list for the instruction, comparing them to I.
+ // Exceptions:
+ // - All uses are identical.
+ // - All uses are shufflevector instructions that fully simplify; this
+ // helps interleave -> phi -> 2x de-interleave+de patterns.
+ if (isa<ShuffleVectorInst>(I)) {
+ MultipleUses = true;
+ }
+ AllUsesIdentical = true;
+ unsigned NumUses = 0;
for (User *U : PN->users()) {
+ ++NumUses;
Instruction *UI = cast<Instruction>(U);
- if (UI != &I && !I.isIdenticalTo(UI))
+ if (UI == &I)
+ continue;
+
+ if (!I.isIdenticalTo(UI))
+ AllUsesIdentical = false;
+ // Only inspect first 4 uses to avoid quadratic complexity.
+ if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
+ MultipleUses = false;
+ if (!AllUsesIdentical && !MultipleUses)
return nullptr;
}
- // Otherwise, we can replace *all* users with the new PHI we form.
}
// Check that all operands are phi-translatable.
@@ -1834,6 +1850,11 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
continue;
}
+ // Be conservative in MultipleUses case and do not allow non-simplified
+ // vals.
+ if (MultipleUses)
+ return nullptr;
+
if (SeenNonSimplifiedInVal)
return nullptr; // More than one non-simplified value.
SeenNonSimplifiedInVal = true;
@@ -1895,17 +1916,21 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
for (unsigned i = 0; i != NumPHIValues; ++i)
NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
- for (User *U : make_early_inc_range(PN->users())) {
- Instruction *User = cast<Instruction>(U);
- if (User == &I)
- continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
+ if (AllUsesIdentical) {
+ for (User *U : make_early_inc_range(PN->users())) {
+ Instruction *User = cast<Instruction>(U);
+ if (User == &I)
+ continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
}
- replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
- const_cast<PHINode &>(*NewPN),
- const_cast<PHINode &>(*PN), DT);
+ if (!MultipleUses || AllUsesIdentical) {
+ replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
+ const_cast<PHINode &>(*NewPN),
+ const_cast<PHINode &>(*PN), DT);
+ }
return replaceInstUsesWith(I, NewPN);
}
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
new file mode 100644
index 00000000000000..597eae101e9f2d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+define void @f(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[ACC:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_LOWS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ODDS:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_HIGHS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[P_BEGIN]], %[[ENTRY]] ], [ [[P_INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load <4 x i8>, ptr [[P]], align 4
+; CHECK-NEXT: [[HIGHS:%.*]] = ashr <4 x i8> [[VAL]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT: [[LOWS:%.*]] = and <4 x i8> [[VAL]], <i8 15, i8 15, i8 15, i8 15>
+; CHECK-NEXT: [[HIGHS_F:%.*]] = sitofp <4 x i8> [[HIGHS]] to <4 x float>
+; CHECK-NEXT: [[LOWS_F:%.*]] = uitofp nneg <4 x i8> [[LOWS]] to <4 x float>
+; CHECK-NEXT: [[SUM_LOWS]] = fadd <4 x float> [[ACC]], [[LOWS_F]]
+; CHECK-NEXT: [[SUM_HIGHS]] = fadd <4 x float> [[ODDS]], [[HIGHS_F]]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P_INC]], [[P_END]]
+; CHECK-NEXT: br i1 [[C]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[INTERLEAVE:%.*]] = shufflevector <4 x float> [[SUM_LOWS]], <4 x float> [[SUM_HIGHS]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %interleave, %loop ]
+ %p = phi ptr [%p_begin, %entry ], [%p_inc, %loop]
+
+ %evens = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %odds = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+ %val = load <4 x i8>, ptr %p, align 4
+ %highs = ashr <4 x i8> %val, <i8 4, i8 4, i8 4, i8 4>
+ %lows = and <4 x i8> %val, <i8 15, i8 15, i8 15, i8 15>
+
+ %highs_f = sitofp <4 x i8> %highs to <4 x float>
+ %lows_f = sitofp <4 x i8> %lows to <4 x float>
+
+ %sum_lows = fadd <4 x float> %evens, %lows_f
+ %sum_highs = fadd <4 x float> %odds, %highs_f
+
+ %interleave = shufflevector <4 x float> %sum_lows, <4 x float> %sum_highs, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+
+ %p_inc = getelementptr inbounds i8, ptr %p, i32 4
+ %c = icmp eq ptr %p_inc, %p_end
+ br i1 %c, label %exit, label %loop
+
+exit:
+ store <8 x float> %interleave, ptr %out, align 4
+ ret void
+}
|
This pattern happens when autovectorizing an important inner loop of fbgemm ( https://github.com/pytorch/FBGEMM/blob/119052f4dddc08064ea32d058c05055f1a25ec74/src/EmbeddingSpMDMAutovec.cc#L418 ); a simplified version of this is shown in the new |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Please wait for additional approval from other reviewers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I'm reading this correctly, then nothing here actually guarantees that all shufflevectors are going to simplify? You just check that all uses are shufflevectors, but then simplify each individually.
If we say that we consider it profitable to replace a shufflevector with a phi in general, then we shouldn't need the check on the uses for shufflevectors at all.
See L1851-1855:
|
Right, but doesn't think just check that this shufflevector simplifies (for all phi inputs), not that all shufflevectors simplify? |
new versions checks that all phi users simplify. |
My assumption is that turning one Phi into multiple Phi instructions is not necessarily an improvement. I could see this being an improvement on average, but it's hard to measure make statements about average code. Given it's not a clear improvement I am not sure I want to deal with the complaints for the regressions and rather would be more conservative... |
I'd rather start with a simple implementation and then make it more complex if needed, rather than start with a complex implementation from the start "just in case". If we really want to check all shufflevectors, I think the transform should be further separated from the existing one and simplify all the shufflevectors at once. It makes little sense to check that all of them simplify and compute the replacement values, then discard all of that and replace a single one of them, and then repeat the whole process again for the next shufflevector. |
|
I'm saying we should try the latter first and if that fails go to the former. |
745fac7
to
95dd2c5
Compare
Here's the simpler version just optimistically transforming I did So worth a try optimistically enabling this. [1] |
95dd2c5
to
6f74d52
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for being unclear, I do think that you should still keep the !SeenNonSimplifiedInVal check for the multi-use case. Without that check we could end up with both the original phis and extra new shufflevectors (rather than replacing shufflevectors with phis).
updated to require all phi inputs to simplify if there were multiple uses. |
@nikic now good to go? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
We would like to optimize situations of the form that happen after loop vectorization+SROA:
where the interleave and de-interleave shuffle operations cancel each other out.
This could be handled by
foldOpPhi
but does not currently work because it doesnot proceed when there are multiple uses of the
Phi
operation.This extends
foldOpPhi
to allow multipleshufflevector
uses when they areshown to simplify for all
Phi
input values.