Skip to content

Commit 3a6108b

Browse files
authored
[SLP][REVEC] Fix a scalar mask being passed to getScalarizationOverhead when the type is a vector. (#128476)
Fix "Vector size mismatch".
1 parent 2dfb29a commit 3a6108b

File tree

2 files changed

+65
-4
lines changed

2 files changed

+65
-4
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13968,15 +13968,31 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1396813968
ShuffledElements.setBit(I);
1396913969
ShuffleMask[I] = Res.first->second;
1397013970
}
13971-
if (!DemandedElements.isZero())
13972-
Cost +=
13973-
TTI->getScalarizationOverhead(VecTy, DemandedElements, /*Insert=*/true,
13974-
/*Extract=*/false, CostKind, VL);
13971+
if (!DemandedElements.isZero()) {
13972+
if (isa<FixedVectorType>(ScalarTy)) {
13973+
assert(SLPReVec && "Only supported by REVEC.");
13974+
// We don't need to insert elements one by one. Instead, we can insert the
13975+
// entire vector into the destination.
13976+
Cost = 0;
13977+
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13978+
for (unsigned I : seq<unsigned>(VL.size()))
13979+
if (DemandedElements[I])
13980+
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
13981+
CostKind, I * ScalarTyNumElements,
13982+
cast<FixedVectorType>(ScalarTy));
13983+
} else {
13984+
Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
13985+
/*Insert=*/true,
13986+
/*Extract=*/false, CostKind, VL);
13987+
}
13988+
}
1397513989
if (ForPoisonSrc) {
1397613990
if (isa<FixedVectorType>(ScalarTy)) {
1397713991
assert(SLPReVec && "Only supported by REVEC.");
1397813992
// We don't need to insert elements one by one. Instead, we can insert the
1397913993
// entire vector into the destination.
13994+
assert(DemandedElements.isZero() &&
13995+
"Need to consider the cost from DemandedElements.");
1398013996
Cost = 0;
1398113997
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1398213998
for (unsigned I : seq<unsigned>(VL.size()))
Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -mtriple=s390x-unknown-linux-gnu -mcpu=arch15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
3+
4+
define void @e(<4 x i16> %0) {
5+
; CHECK-LABEL: @e(
6+
; CHECK-NEXT: entry:
7+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
8+
; CHECK: vector.body:
9+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[VECTOR_BODY]] ]
10+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
11+
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i16> [[VEC_IND]], zeroinitializer
12+
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
13+
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[VEC_IND]], zeroinitializer
14+
; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i16> [[TMP0:%.*]], zeroinitializer
15+
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i32>
16+
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP2]], [[TMP5]]
17+
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i16> [[TMP3]], zeroinitializer
18+
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i32>
19+
; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP6]], [[TMP8]]
20+
; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i16> zeroinitializer, zeroinitializer
21+
; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32>
22+
; CHECK-NEXT: [[TMP12]] = or <4 x i32> [[TMP9]], [[TMP11]]
23+
; CHECK-NEXT: br label [[VECTOR_BODY]]
24+
;
25+
entry:
26+
br label %vector.body
27+
28+
vector.body: ; preds = %vector.body, %entry
29+
%vec.ind = phi <4 x i16> [ zeroinitializer, %entry ], [ zeroinitializer, %vector.body ]
30+
%vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %13, %vector.body ]
31+
%1 = icmp sgt <4 x i16> %vec.ind, zeroinitializer
32+
%2 = zext <4 x i1> %1 to <4 x i32>
33+
%3 = add <4 x i16> %vec.ind, zeroinitializer
34+
%4 = icmp sgt <4 x i16> %0, zeroinitializer
35+
%5 = zext <4 x i1> %4 to <4 x i32>
36+
%6 = or <4 x i32> %2, %5
37+
%7 = add <4 x i16> zeroinitializer, zeroinitializer
38+
%8 = icmp sgt <4 x i16> %3, zeroinitializer
39+
%9 = zext <4 x i1> %8 to <4 x i32>
40+
%10 = or <4 x i32> %6, %9
41+
%11 = icmp sgt <4 x i16> %7, zeroinitializer
42+
%12 = zext <4 x i1> %11 to <4 x i32>
43+
%13 = or <4 x i32> %10, %12
44+
br label %vector.body
45+
}

0 commit comments

Comments
 (0)