Skip to content

Commit b96a2c7

Browse files
committed
[X86][AVX] Enable AVX1 broadcasts in shuffle combining
Enables 32/64-bit scalar load broadcasts on AVX1 targets. The extractelement-load.ll regression will be fixed shortly in a follow-up commit. llvm-svn: 352743
1 parent 51c2efc commit b96a2c7

File tree

8 files changed

+29
-24
lines changed

8 files changed

+29
-24
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31035,15 +31035,27 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3103531035
}
3103631036

3103731037
// Attempt to match against broadcast-from-vector.
31038-
// TODO: Add (partial) AVX1 support.
31039-
if (Subtarget.hasAVX2() && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
31038+
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
31039+
if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
31040+
&& (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
3104031041
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
3104131042
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
31042-
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
31043-
return SDValue(); // Nothing to do!
31044-
Res = DAG.getBitcast(MaskVT, V1);
31045-
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
31046-
return DAG.getBitcast(RootVT, Res);
31043+
if (V1.getValueType() == MaskVT &&
31044+
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
31045+
MayFoldLoad(V1.getOperand(0))) {
31046+
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
31047+
return SDValue(); // Nothing to do!
31048+
Res = V1.getOperand(0);
31049+
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
31050+
return DAG.getBitcast(RootVT, Res);
31051+
}
31052+
if (Subtarget.hasAVX2()) {
31053+
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
31054+
return SDValue(); // Nothing to do!
31055+
Res = DAG.getBitcast(MaskVT, V1);
31056+
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
31057+
return DAG.getBitcast(RootVT, Res);
31058+
}
3104731059
}
3104831060
}
3104931061

llvm/test/CodeGen/X86/avx-vbroadcast.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,7 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
596596
;
597597
; X64-LABEL: G:
598598
; X64: ## %bb.0: ## %entry
599-
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
600-
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
599+
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
601600
; X64-NEXT: retq
602601
entry:
603602
%q = load i64, i64* %ptr, align 8

llvm/test/CodeGen/X86/extractelement-load.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ define i64 @t4(<2 x double>* %a) {
9898
;
9999
; X64-AVX-LABEL: t4:
100100
; X64-AVX: # %bb.0:
101-
; X64-AVX-NEXT: movq (%rdi), %rax
101+
; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
102+
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
102103
; X64-AVX-NEXT: retq
103104
%b = load <2 x double>, <2 x double>* %a, align 16
104105
%c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>

llvm/test/CodeGen/X86/insert-into-constant-vector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,8 +273,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) {
273273
;
274274
; X32AVX1-LABEL: elt7_v8i32:
275275
; X32AVX1: # %bb.0:
276-
; X32AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
277-
; X32AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
276+
; X32AVX1-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
278277
; X32AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
279278
; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
280279
; X32AVX1-NEXT: retl

llvm/test/CodeGen/X86/insert-loaded-scalar.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,7 @@ define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {
180180
;
181181
; AVX1-LABEL: load64_ins_eltc_v2i64:
182182
; AVX1: # %bb.0:
183-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
184-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
183+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
185184
; AVX1-NEXT: retq
186185
;
187186
; AVX2-LABEL: load64_ins_eltc_v2i64:

llvm/test/CodeGen/X86/insertelement-var-index.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,7 @@ define <2 x i64> @load_i64_v2i64(i64* %p, i32 %y) nounwind {
205205
;
206206
; AVX1-LABEL: load_i64_v2i64:
207207
; AVX1: # %bb.0:
208-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
209-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
208+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
210209
; AVX1-NEXT: retq
211210
;
212211
; AVX2-LABEL: load_i64_v2i64:

llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,8 +1266,7 @@ define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) {
12661266
;
12671267
; AVX1-LABEL: insert_dup_mem_v2i64:
12681268
; AVX1: # %bb.0:
1269-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1270-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1269+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
12711270
; AVX1-NEXT: retq
12721271
;
12731272
; AVX2-LABEL: insert_dup_mem_v2i64:

llvm/test/CodeGen/X86/widened-broadcast.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -582,8 +582,7 @@ define <4 x i32> @load_splat_4i32_2i32_0101(<2 x i32>* %vp) {
582582
;
583583
; AVX1-LABEL: load_splat_4i32_2i32_0101:
584584
; AVX1: # %bb.0:
585-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
586-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
585+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
587586
; AVX1-NEXT: retq
588587
;
589588
; AVX2-LABEL: load_splat_4i32_2i32_0101:
@@ -610,8 +609,7 @@ define <8 x i32> @load_splat_8i32_2i32_0101(<2 x i32>* %vp) {
610609
;
611610
; AVX1-LABEL: load_splat_8i32_2i32_0101:
612611
; AVX1: # %bb.0:
613-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
614-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
612+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
615613
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
616614
; AVX1-NEXT: retq
617615
;
@@ -641,8 +639,7 @@ define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
641639
;
642640
; AVX1-LABEL: load_splat_16i32_2i32_0101:
643641
; AVX1: # %bb.0:
644-
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
645-
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
642+
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
646643
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
647644
; AVX1-NEXT: vmovaps %ymm0, %ymm1
648645
; AVX1-NEXT: retq

0 commit comments

Comments
 (0)