Skip to content

Commit 035e64c

Browse files
committed
[VectorCombine] eraseInstruction - ensure we reattempt to fold other users of an erased instruction's operands (REAPPLIED)
As we're reducing the use count of the operands, it's more likely that they will now fold, as they were previously being prevented by an m_OneUse check, or the cost of retaining the extra instruction had been too high. This is necessary for some upcoming patches, although the only change so far is instruction ordering, as it allows some SSE folds of 256/512-bit with 128-bit subvectors to occur earlier in foldShuffleToIdentity because the subvector concats are free. Reapplied with a fix for foldSingleElementStore/scalarizeLoadExtract, which were replacing/removing memory operations - we need to ensure that the worklist is populated in the correct order so all users of the old memory operations are erased first, so there are no remaining users of the loads when it's time to remove them as well. Pulled out of #120984
1 parent dd30aa8 commit 035e64c

File tree

2 files changed

+66
-17
lines changed

2 files changed

+66
-17
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,17 @@ class VectorCombine {
141141

142142
void eraseInstruction(Instruction &I) {
143143
LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
144-
for (Value *Op : I.operands())
145-
Worklist.pushValue(Op);
144+
SmallVector<Value *> Ops(I.operands());
146145
Worklist.remove(&I);
147146
I.eraseFromParent();
147+
148+
// Push remaining users of the operands and then the operand itself - allows
149+
// further folds that were hindered by OneUse limits.
150+
for (Value *Op : Ops)
151+
if (auto *OpI = dyn_cast<Instruction>(Op)) {
152+
Worklist.pushUsersToWorkList(*OpI);
153+
Worklist.pushValue(OpI);
154+
}
148155
}
149156
};
150157
} // namespace
@@ -1337,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
13371344
MemoryLocation::get(SI), AA))
13381345
return false;
13391346

1347+
// Ensure we add the load back to the worklist BEFORE its users so they can
1348+
// be erased in the correct order.
1349+
Worklist.push(Load);
1350+
13401351
if (ScalarizableIdx.isSafeWithFreeze())
13411352
ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
13421353
Value *GEP = Builder.CreateInBoundsGEP(
@@ -1425,6 +1436,10 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
14251436
if (ScalarizedCost >= OriginalCost)
14261437
return false;
14271438

1439+
// Ensure we add the load back to the worklist BEFORE its users so they can
1440+
// be erased in the correct order.
1441+
Worklist.push(LI);
1442+
14281443
// Replace extracts with narrow scalar loads.
14291444
for (User *U : LI->users()) {
14301445
auto *EI = cast<ExtractElementInst>(U);

llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
8080
}
8181

8282
define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
83-
; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
84-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
85-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
86-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
87-
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
88-
; CHECK-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
89-
; CHECK-NEXT: ret i64 [[OR]]
83+
; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
84+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
85+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
86+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
87+
; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
88+
; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
89+
; SSE-NEXT: ret i64 [[OR]]
90+
;
91+
; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
92+
; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
93+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
94+
; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
95+
; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
96+
; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
97+
; AVX2-NEXT: ret i64 [[OR]]
98+
;
99+
; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
100+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
101+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
102+
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
103+
; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
104+
; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
105+
; AVX512-NEXT: ret i64 [[OR]]
90106
;
91107
%c0 = icmp slt <16 x i8> %v0, zeroinitializer
92108
%c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
110126
}
111127

112128
define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
113-
; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
114-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
115-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
116-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
117-
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
118-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
119-
; CHECK-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64
120-
; CHECK-NEXT: ret i64 [[OR]]
129+
; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
130+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
131+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
132+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
133+
; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
134+
; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
135+
; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64
136+
; SSE-NEXT: ret i64 [[OR]]
137+
;
138+
; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
139+
; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
140+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
141+
; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
142+
; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
143+
; AVX2-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
144+
; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64
145+
; AVX2-NEXT: ret i64 [[OR]]
146+
;
147+
; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
148+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
149+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
150+
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
151+
; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
152+
; AVX512-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
153+
; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64
154+
; AVX512-NEXT: ret i64 [[OR]]
121155
;
122156
%c0 = icmp slt <4 x i32> %v0, zeroinitializer
123157
%c1 = icmp slt <4 x i32> %v1, zeroinitializer

0 commit comments

Comments
 (0)