Skip to content

Commit 5ff3674

Browse files
committed
[SLP]Fix mask processing for reused gathered scalars
Need to sync the mask between cost estimation and actual emission to avoid bugs in mask calculation. Fixes #122324.
1 parent 26aa20a commit 5ff3674

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10973,7 +10973,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1097310973
}
1097410974
}
1097510975

10976-
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
10976+
if (!ExtMask.empty()) {
10977+
if (CommonMask.empty()) {
10978+
CommonMask.assign(ExtMask.begin(), ExtMask.end());
10979+
} else {
10980+
SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10981+
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10982+
if (ExtMask[I] == PoisonMaskElem)
10983+
continue;
10984+
NewMask[I] = CommonMask[ExtMask[I]];
10985+
}
10986+
CommonMask.swap(NewMask);
10987+
}
10988+
}
1097710989
if (CommonMask.empty()) {
1097810990
assert(InVectors.size() == 1 && "Expected only one vector with no mask");
1097910991
return Cost;
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
3+
4+
define i1 @test() {
5+
; CHECK-LABEL: define i1 @test() {
6+
; CHECK-NEXT: [[ENTRY:.*:]]
7+
; CHECK-NEXT: [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1
8+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 2
9+
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> zeroinitializer, [[TMP0]]
10+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
11+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
12+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP3]], <4 x i32> <i32 2, i32 2, i32 7, i32 2>
13+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
14+
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 0, i32 1, i32 1, i32 1>
15+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], <i32 1, i32 0, i32 0, i32 0>
16+
; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
17+
; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP8]], false
18+
; CHECK-NEXT: ret i1 [[OP_RDX]]
19+
;
20+
entry:
21+
%h.promoted118.i.fr = freeze i32 1
22+
%invariant.op.i51 = add i32 %h.promoted118.i.fr, 0
23+
%conv25.i = xor i32 0, 0
24+
%add.i.i = add i32 %conv25.i, %h.promoted118.i.fr
25+
%sext.i.mask = and i32 %add.i.i, 0
26+
%cmp27.i = icmp eq i32 %sext.i.mask, 1
27+
%0 = or i1 %cmp27.i, false
28+
%conv25.i.1 = add i32 0, 0
29+
%add.i.i.1 = add i32 %conv25.i.1, %h.promoted118.i.fr
30+
%sext.i.1.mask = and i32 %add.i.i.1, 1
31+
%cmp27.i.1 = icmp eq i32 %sext.i.1.mask, 0
32+
%conv25.1.i.1 = xor i32 0, 0
33+
%add.i.1.i.1 = add i32 %conv25.1.i.1, %h.promoted118.i.fr
34+
%sext.1.i.1.mask = and i32 %add.i.1.i.1, 1
35+
%cmp27.1.i.1 = icmp eq i32 %sext.1.i.1.mask, 0
36+
%add.i.2.reass.i.1 = add i32 %invariant.op.i51, %conv25.i.1
37+
%sext.2.i.1.mask = and i32 %add.i.2.reass.i.1, 1
38+
%cmp27.2.i.1 = icmp eq i32 %sext.2.i.1.mask, 0
39+
%1 = or i1 %cmp27.1.i.1, %cmp27.2.i.1
40+
%2 = or i1 %cmp27.i.1, %1
41+
%3 = or i1 %0, %2
42+
ret i1 %3
43+
}

0 commit comments

Comments
 (0)