
Commit aca34da

Prioritize lowering V{4|16}F32 with blend.
Blend is often the fastest available instruction, so it should be given higher priority for v4f32 and offered as an option for v16f32.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D143856
1 parent f1c8b72 commit aca34da
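For illustration, here is a minimal v4f32 case of the kind this reordering targets. This is a hypothetical example, not one of the tests in this commit, and it assumes an SSE4.1-capable target:

; Hypothetical example: every result lane keeps its own position and only the source
; operand varies, which is the pattern lowerShuffleAsBlend matches (a single blendps on SSE4.1).
define <4 x float> @blend_v4f32(<4 x float> %a, <4 x float> %b) {
  ; Mask <0,5,2,7>: lanes 0 and 2 from %a, lanes 1 and 3 from %b.
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %r
}

With the blend attempt hoisted to the top of lowerV4F32Shuffle (see the diff below), a mask like this should be recognized as a blend before the other v4f32 strategies are tried.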


3 files changed: +30 -20 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 4 deletions
@@ -15445,6 +15445,11 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
+  if (Subtarget.hasSSE41())
+    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                            Zeroable, Subtarget, DAG))
+      return Blend;
+
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
@@ -15498,10 +15503,6 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   if (Subtarget.hasSSE41()) {
-    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
-                                            Zeroable, Subtarget, DAG))
-      return Blend;
-
     // Use INSERTPS if we can complete the shuffle efficiently.
     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
       return V;
@@ -19082,6 +19083,10 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
 
+  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+                                          Zeroable, Subtarget, DAG))
+    return Blend;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
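For v16f32, the new lowerShuffleAsBlend call gives the lowering a chance to use an AVX-512 write mask instead of building a two-source permute. A hypothetical sketch of such a shuffle (not part of this commit's tests; it assumes an AVX-512F target):

; Hypothetical example: lane 14 (mask element 30) comes from %b, all other lanes from %a.
define <16 x float> @blend_v16f32(<16 x float> %a, <16 x float> %b) {
  %r = shufflevector <16 x float> %a, <16 x float> %b,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15>
  ret <16 x float> %r
}

A single-lane replacement like this can be expressed with a k-register merge mask (the movw/kmovw/{%k1} sequence visible in the updated avx512-insert-extract.ll checks below) rather than a vpermi2ps index vector.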

llvm/test/CodeGen/X86/avx512-insert-extract.ll

Lines changed: 17 additions & 8 deletions
@@ -4,14 +4,23 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s
 
 define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind {
-; CHECK-LABEL: test1:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT:    vbroadcastss %xmm1, %zmm1
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
-; CHECK-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: test1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT:    movw $16384, %ax ## imm = 0x4000
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT:    movw $16384, %ax ## imm = 0x4000
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %rrr = load float, ptr %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
   %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14

llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll

Lines changed: 4 additions & 8 deletions
@@ -305,19 +305,15 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovups (%rdi), %zmm1
-; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovups (%eax), %zmm1
-; X86-AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X86-AVX512F-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
+; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
 ; X86-AVX512F-NEXT:    retl
   %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
   %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
