Skip to content

Commit 71d0fd3

Browse files
committed
[X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load
As noticed on PR50053, we were failing to recognise when a shuffle of a load was really a subvector broadcast load.
1 parent b63833a commit 71d0fd3

File tree

2 files changed

+37
-23
lines changed

2 files changed

+37
-23
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16054,9 +16054,33 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
1605416054
const APInt &Zeroable,
1605516055
const X86Subtarget &Subtarget,
1605616056
SelectionDAG &DAG) {
16057-
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16058-
if (Subtarget.hasAVX2() && V2.isUndef())
16059-
return SDValue();
16057+
if (V2.isUndef()) {
16058+
// Attempt to match VBROADCAST*128 subvector broadcast load.
16059+
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16060+
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16061+
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16062+
MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16063+
auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16064+
if (!Ld->isNonTemporal()) {
16065+
MVT MemVT = VT.getHalfNumVectorElementsVT();
16066+
unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16067+
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16068+
SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16069+
TypeSize::Fixed(Ofs), DL);
16070+
SDValue Ops[] = {Ld->getChain(), Ptr};
16071+
SDValue BcastLd = DAG.getMemIntrinsicNode(
16072+
X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16073+
DAG.getMachineFunction().getMachineMemOperand(
16074+
Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16075+
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16076+
return BcastLd;
16077+
}
16078+
}
16079+
16080+
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16081+
if (Subtarget.hasAVX2())
16082+
return SDValue();
16083+
}
1606016084

1606116085
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
1606216086

llvm/test/CodeGen/X86/avx-vperm2x128.ll

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
44

55
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
66
; AVX1-LABEL: shuffle_v8f32_45670123:
@@ -60,15 +60,10 @@ entry:
6060
}
6161

6262
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
63-
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
64-
; AVX1: # %bb.0: # %entry
65-
; AVX1-NEXT: vperm2f128 $34, (%rdi), %ymm0, %ymm0 # ymm0 = mem[0,1,0,1]
66-
; AVX1-NEXT: retq
67-
;
68-
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
69-
; AVX2: # %bb.0: # %entry
70-
; AVX2-NEXT: vpermpd $68, (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
71-
; AVX2-NEXT: retq
63+
; ALL-LABEL: shuffle_v8f32_01230123_mem:
64+
; ALL: # %bb.0: # %entry
65+
; ALL-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
66+
; ALL-NEXT: retq
7267
entry:
7368
%a = load <8 x float>, <8 x float>* %pa
7469
%b = load <8 x float>, <8 x float>* %pb
@@ -92,15 +87,10 @@ entry:
9287
}
9388

9489
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
95-
; AVX1-LABEL: shuffle_v8f32_45674567_mem:
96-
; AVX1: # %bb.0: # %entry
97-
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
98-
; AVX1-NEXT: retq
99-
;
100-
; AVX2-LABEL: shuffle_v8f32_45674567_mem:
101-
; AVX2: # %bb.0: # %entry
102-
; AVX2-NEXT: vpermpd $238, (%rdi), %ymm0 # ymm0 = mem[2,3,2,3]
103-
; AVX2-NEXT: retq
90+
; ALL-LABEL: shuffle_v8f32_45674567_mem:
91+
; ALL: # %bb.0: # %entry
92+
; ALL-NEXT: vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
93+
; ALL-NEXT: retq
10494
entry:
10595
%a = load <8 x float>, <8 x float>* %pa
10696
%b = load <8 x float>, <8 x float>* %pb

0 commit comments

Comments
 (0)