Skip to content

Commit eeeb18c

Browse files
committed
[X86] Change the behavior of canWidenShuffleElements used by lowerV2X128Shuffle to match the behavior in lowerVectorShuffle with regards to zeroable elements.
Previously we marked zeroable elements in a way that prevented the widening check from recognizing that it could widen. Now we only mark them as zeroable if V2 is an all-zeros vector. This matches what we do when widening elements in lowerVectorShuffle. Fixes PR43866.
1 parent deaf121 commit eeeb18c

File tree

2 files changed

+51
-19
lines changed

2 files changed

+51
-19
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5324,15 +5324,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
53245324

53255325
static bool canWidenShuffleElements(ArrayRef<int> Mask,
53265326
const APInt &Zeroable,
5327+
bool V2IsZero,
53275328
SmallVectorImpl<int> &WidenedMask) {
5328-
SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
5329-
for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
5330-
if (TargetMask[i] == SM_SentinelUndef)
5331-
continue;
5332-
if (Zeroable[i])
5333-
TargetMask[i] = SM_SentinelZero;
5329+
// Create an alternative mask with info about zeroable elements.
5330+
// Here we do not set undef elements as zeroable.
5331+
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5332+
if (V2IsZero) {
5333+
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5334+
for (int i = 0, Size = Mask.size(); i != Size; ++i)
5335+
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5336+
ZeroableMask[i] = SM_SentinelZero;
53345337
}
5335-
return canWidenShuffleElements(TargetMask, WidenedMask);
5338+
return canWidenShuffleElements(ZeroableMask, WidenedMask);
53365339
}
53375340

53385341
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
@@ -14817,8 +14820,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
1481714820
if (Subtarget.hasAVX2() && V2.isUndef())
1481814821
return SDValue();
1481914822

14823+
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14824+
1482014825
SmallVector<int, 4> WidenedMask;
14821-
if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
14826+
if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
1482214827
return SDValue();
1482314828

1482414829
bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -17095,23 +17100,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
1709517100

1709617101
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
1709717102

17098-
// Create an alternative mask with info about zeroable elements.
17099-
// Here we do not set undef elements as zeroable.
17100-
SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
17101-
if (V2IsZero) {
17102-
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
17103-
for (int i = 0; i != NumElements; ++i)
17104-
if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
17105-
ZeroableMask[i] = SM_SentinelZero;
17106-
}
17107-
1710817103
// Try to collapse shuffles into using a vector type with fewer elements but
1710917104
// wider element types. We cap this to not form integers or floating point
1711017105
// elements wider than 64 bits, but it might be interesting to form i128
1711117106
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
1711217107
SmallVector<int, 16> WidenedMask;
1711317108
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17114-
canWidenShuffleElements(ZeroableMask, WidenedMask)) {
17109+
canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
1711517110
// Shuffle mask widening should not interfere with a broadcast opportunity
1711617111
// by obfuscating the operands with bitcasts.
1711717112
// TODO: Avoid lowering directly from this top-level function: make this

llvm/test/CodeGen/X86/pr43866.ll

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
3+
4+
@v2_0 = global <2 x i32> zeroinitializer, align 8
5+
6+
define void @test() {
7+
; CHECK-LABEL: test:
8+
; CHECK: # %bb.0: # %entry
9+
; CHECK-NEXT: pushq %rbp
10+
; CHECK-NEXT: .cfi_def_cfa_offset 16
11+
; CHECK-NEXT: .cfi_offset %rbp, -16
12+
; CHECK-NEXT: movq %rsp, %rbp
13+
; CHECK-NEXT: .cfi_def_cfa_register %rbp
14+
; CHECK-NEXT: andq $-32, %rsp
15+
; CHECK-NEXT: subq $64, %rsp
16+
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
17+
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
18+
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
19+
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
20+
; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
21+
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
22+
; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
23+
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
24+
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
25+
; CHECK-NEXT: movq %rbp, %rsp
26+
; CHECK-NEXT: popq %rbp
27+
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
28+
; CHECK-NEXT: vzeroupper
29+
; CHECK-NEXT: retq
30+
entry:
31+
%v8_0 = alloca <8 x i32>, align 32
32+
%v8_0.0.v8_0.0..sroa_cast = bitcast <8 x i32>* %v8_0 to i8*
33+
%0 = load <2 x i32>, <2 x i32>* @v2_0, align 8
34+
%shuffle = shufflevector <2 x i32> %0, <2 x i32> <i32 -1, i32 -1>, <8 x i32> <i32 1, i32 3, i32 0, i32 0, i32 3, i32 3, i32 2, i32 2>
35+
store volatile <8 x i32> %shuffle, <8 x i32>* %v8_0, align 32
36+
ret void
37+
}

0 commit comments

Comments
 (0)