Skip to content

Commit 6c1d445

Browse files
[SLP]Improve minbitwidth analysis for shifts.
Adds improved bitwidth analysis for shl/ashr/lshr instructions. The analysis is based on a similar version in InstCombiner. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: llvm#84356
1 parent 81d9ed6 commit 6c1d445

File tree

3 files changed

+109
-19
lines changed

3 files changed

+109
-19
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13995,9 +13995,11 @@ bool BoUpSLP::collectValuesToDemote(
1399513995
if (MultiNodeScalars.contains(V))
1399613996
return false;
1399713997
uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
13998-
APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
13999-
if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14000-
return true;
13998+
if (OrigBitWidth > BitWidth) {
13999+
APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14000+
if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14001+
return true;
14002+
}
1400114003
auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
1400214004
unsigned BitWidth1 = OrigBitWidth - NumSignBits;
1400314005
if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
@@ -14042,6 +14044,30 @@ bool BoUpSLP::collectValuesToDemote(
1404214044
}
1404314045
return true;
1404414046
};
14047+
auto AttemptCheckBitwidth =
14048+
[&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14049+
// Try all bitwidth < OrigBitWidth.
14050+
NeedToExit = false;
14051+
uint32_t OrigBitWidth = DL->getTypeSizeInBits(I->getType());
14052+
unsigned BestFailBitwidth = 0;
14053+
for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14054+
if (Checker(BitWidth, OrigBitWidth))
14055+
return true;
14056+
if (BestFailBitwidth == 0 && FinalAnalysis())
14057+
BestFailBitwidth = BitWidth;
14058+
}
14059+
if (BitWidth >= OrigBitWidth) {
14060+
if (BestFailBitwidth == 0) {
14061+
BitWidth = OrigBitWidth;
14062+
return false;
14063+
}
14064+
MaxDepthLevel = 1;
14065+
BitWidth = BestFailBitwidth;
14066+
NeedToExit = true;
14067+
return true;
14068+
}
14069+
return false;
14070+
};
1404514071
bool NeedToExit = false;
1404614072
switch (I->getOpcode()) {
1404714073

@@ -14074,6 +14100,71 @@ bool BoUpSLP::collectValuesToDemote(
1407414100
return false;
1407514101
break;
1407614102
}
14103+
case Instruction::Shl: {
14104+
// Several vectorized uses? Check if we can truncate it, otherwise - exit.
14105+
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
14106+
return false;
14107+
// If we are truncating the result of this SHL, and if it's a shift of an
14108+
// inrange amount, we can always perform a SHL in a smaller type.
14109+
if (!AttemptCheckBitwidth(
14110+
[&](unsigned BitWidth, unsigned) {
14111+
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14112+
return AmtKnownBits.getMaxValue().ult(BitWidth);
14113+
},
14114+
NeedToExit))
14115+
return false;
14116+
if (NeedToExit)
14117+
return true;
14118+
if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
14119+
return false;
14120+
break;
14121+
}
14122+
case Instruction::LShr: {
14123+
// Several vectorized uses? Check if we can truncate it, otherwise - exit.
14124+
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
14125+
return false;
14126+
// If this is a truncate of a logical shr, we can truncate it to a smaller
14127+
// lshr iff we know that the bits we would otherwise be shifting in are
14128+
// already zeros.
14129+
if (!AttemptCheckBitwidth(
14130+
[&](unsigned BitWidth, unsigned OrigBitWidth) {
14131+
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14132+
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14133+
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14134+
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14135+
SimplifyQuery(*DL));
14136+
},
14137+
NeedToExit))
14138+
return false;
14139+
if (NeedToExit)
14140+
return true;
14141+
if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
14142+
return false;
14143+
break;
14144+
}
14145+
case Instruction::AShr: {
14146+
// Several vectorized uses? Check if we can truncate it, otherwise - exit.
14147+
if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
14148+
return false;
14149+
// If this is a truncate of an arithmetic shr, we can truncate it to a
14150+
// smaller ashr iff we know that all the bits from the sign bit of the
14151+
// original type and the sign bit of the truncate type are similar.
14152+
if (!AttemptCheckBitwidth(
14153+
[&](unsigned BitWidth, unsigned OrigBitWidth) {
14154+
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14155+
unsigned ShiftedBits = OrigBitWidth - BitWidth;
14156+
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14157+
ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0,
14158+
AC, nullptr, DT);
14159+
},
14160+
NeedToExit))
14161+
return false;
14162+
if (NeedToExit)
14163+
return true;
14164+
if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
14165+
return false;
14166+
break;
14167+
}
1407714168

1407814169
// We can demote selects if we can demote their true and false values.
1407914170
case Instruction::Select: {

llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@ define void @test() {
1010
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
1111
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1212
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
13-
; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
14-
; CHECK-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP4]], zeroinitializer
15-
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
16-
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
13+
; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
14+
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
1715
; CHECK-NEXT: ret void
1816
;
1917
entry:

llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,19 @@ define void @test() {
55
; CHECK-LABEL: @test(
66
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr undef, i64 4
77
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr undef, i64 0, i64 1, i64 0
8-
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
9-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
10-
; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP5]]
11-
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw <4 x i32> [[TMP6]], zeroinitializer
12-
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], zeroinitializer
13-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
14-
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP9]]
15-
; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <4 x i32> [[TMP8]], [[TMP9]]
16-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
17-
; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP12]]
18-
; CHECK-NEXT: [[TMP14:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP12]]
19-
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
8+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
9+
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16>
10+
; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> zeroinitializer, [[TMP4]]
11+
; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer
12+
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer
13+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
14+
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i16> [[TMP7]], [[TMP8]]
15+
; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <4 x i16> [[TMP7]], [[TMP8]]
16+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
17+
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i16> zeroinitializer, [[TMP11]]
18+
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <4 x i16> zeroinitializer, [[TMP11]]
19+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
20+
; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[TMP14]] to <4 x i32>
2021
; CHECK-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP2]], align 16
2122
; CHECK-NEXT: ret void
2223
;

0 commit comments

Comments (0)