
Commit 63e2cd6

[AVX-512] Teach the two-address instruction pass to replace masked move instructions with blendm instructions when it is beneficial.

ISel now selects masked move instructions for vselect instead of blendm, but it is sometimes beneficial for register allocation to remove the tied-register constraint by using a blendm instruction instead. This also picks up cases where the masked move was created for a masked load intrinsic.

Differential Revision: https://reviews.llvm.org/D28454

llvm-svn: 292005

1 parent 09b7e0f

16 files changed: +314 −357 lines
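In effect, the pass folds a tied masked move plus the register copy it forces into a single blendm. A minimal sketch distilled from the test updates below (the registers and mask are illustrative):

    # Before: the masked move ties the destination to the pass-through value,
    # so producing the result in %zmm0 costs an extra full-register copy.
    vmovups (%rdi), %zmm1 {%k1}            # zmm1 = k1 ? load : zmm1 (tied)
    vmovaps %zmm1, %zmm0                   # copy the result where it is needed

    # After: blendm takes the pass-through as an ordinary source operand, so
    # the destination register is unconstrained and the copy disappears.
    vblendmps (%rdi), %zmm1, %zmm0 {%k1}   # zmm0 = k1 ? load : zmm1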

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 1 addition & 1 deletion
@@ -2738,7 +2738,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                     [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
                     _.ExeDomain>, EVEX;
 
-  let Constraints = "$src0 = $dst" in {
+  let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
   def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
                      (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
                      !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 125 additions & 0 deletions
@@ -4044,6 +4044,131 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
         BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src),
         MI.getOperand(2));
     break;
+
+  case X86::VMOVDQU8Z128rmk:
+  case X86::VMOVDQU8Z256rmk:
+  case X86::VMOVDQU8Zrmk:
+  case X86::VMOVDQU16Z128rmk:
+  case X86::VMOVDQU16Z256rmk:
+  case X86::VMOVDQU16Zrmk:
+  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
+  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
+  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
+  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
+  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
+  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
+  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
+  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+    unsigned Opc;
+    switch (MIOpc) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+    case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+    case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+    case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+    case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+    case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+    case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+    case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+    case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+    case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+    case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+    case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+    case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+    case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+    case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+    case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+    case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+    case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+    case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+    case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+    case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+    case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+    case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+    case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+    case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+    case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+    case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+    case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+    case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+    case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+    }
+
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                .add(Dest)
+                .add(MI.getOperand(2))
+                .add(Src)
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4))
+                .add(MI.getOperand(5))
+                .add(MI.getOperand(6))
+                .add(MI.getOperand(7));
+    break;
+  }
+  case X86::VMOVDQU8Z128rrk:
+  case X86::VMOVDQU8Z256rrk:
+  case X86::VMOVDQU8Zrrk:
+  case X86::VMOVDQU16Z128rrk:
+  case X86::VMOVDQU16Z256rrk:
+  case X86::VMOVDQU16Zrrk:
+  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
+  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
+  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
+  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
+  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
+  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
+  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
+  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
+    unsigned Opc;
+    switch (MIOpc) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
+    case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
+    case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
+    case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+    case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+    case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
+    case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+    case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+    case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+    case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+    case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+    case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+    case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+    case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+    case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+    case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+    case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+    case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+    case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+    case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+    case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+    case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+    case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+    case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+    case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+    case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+    case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+    case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+    case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+    case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+    }
+
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                .add(Dest)
+                .add(MI.getOperand(2))
+                .add(Src)
+                .add(MI.getOperand(3));
+    break;
+  }
   }
 
   if (!NewMI) return nullptr;
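The register-register (rrk) forms convert the same way; a minimal sketch with illustrative registers and mask:

    # Before: VMOVDQA32Zrrk ties the destination to the pass-through operand.
    vmovdqa32 %zmm2, %zmm1 {%k1}           # zmm1 = k1 ? zmm2 : zmm1 (tied)

    # After: VPBLENDMDZrrk carries the pass-through as a plain source, so the
    # allocator may pick any destination (here %zmm3).
    vpblendmd %zmm2, %zmm1, %zmm3 {%k1}    # zmm3 = k1 ? zmm2 : zmm1

This matches the BuildMI operand order in the hunk above: the new destination, the mask (operand 2 of the move), the old tied pass-through (Src), and then the move's remaining source or memory operands.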

llvm/test/CodeGen/X86/avx512-bugfix-26264.ll

Lines changed: 10 additions & 16 deletions
@@ -6,17 +6,14 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
 ; AVX512BW: ## BB#0:
 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
-; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovapd %zmm1, %zmm0
-; AVX512BW-NEXT: vmovapd %zmm2, %zmm1
-; AVX512BW-NEXT: vmovapd %zmm3, %zmm2
-; AVX512BW-NEXT: vmovapd %zmm4, %zmm3
+; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovapd %zmm5, %zmm2
 ; AVX512BW-NEXT: retq
   %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
   ret <32 x double> %res
@@ -27,17 +24,14 @@ define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64
 ; AVX512BW: ## BB#0:
 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm5 {%k2}
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
-; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
 ; AVX512BW-NEXT: retq
   %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
   ret <32 x i64> %res

llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll

Lines changed: 19 additions & 34 deletions
@@ -43,8 +43,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT: retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
@@ -189,22 +188,18 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_16i64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
 ; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
+; SKX-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: retq
   %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
   ret <16 x i64> %res
@@ -217,22 +212,18 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_16f64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
 ; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: retq
   %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
   ret <16 x double> %res
@@ -246,36 +237,30 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
 ; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
 ; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k2, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
-; AVX512F-NEXT: vmovapd %zmm3, %zmm2
-; AVX512F-NEXT: vmovapd %zmm4, %zmm3
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovapd %zmm5, %zmm2
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_32f64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT: vpmovb2m %ymm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrd $16, %k1, %k2
-; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k2, %k1
-; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
-; SKX-NEXT: vmovapd %zmm3, %zmm2
-; SKX-NEXT: vmovapd %zmm4, %zmm3
+; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
+; SKX-NEXT: vmovapd %zmm5, %zmm2
 ; SKX-NEXT: retq
   %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
   ret <32 x double> %res

llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll

Lines changed: 3 additions & 6 deletions
@@ -20,8 +20,7 @@ define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> %val)
   ret <32 x i8> %res
@@ -33,8 +32,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
 ; CHECK-NEXT: vpmovb2m %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
   %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
   ret <64 x i8> %res
@@ -70,8 +68,7 @@ define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
   %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
   ret <32 x i16> %res

llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll

Lines changed: 6 additions & 12 deletions
@@ -325,13 +325,11 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
 }
 
 ; X32-LABEL: test_argRet128Vector:
-; X32: vmovdqa{{.*}} %xmm0, %xmm1
-; X32: vmovdqa{{.*}} %xmm1, %xmm0
+; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0
 ; X32: ret{{.*}}
 
 ; WIN64-LABEL: test_argRet128Vector:
-; WIN64: vmovdqa{{.*}} %xmm0, %xmm1
-; WIN64: vmovdqa{{.*}} %xmm1, %xmm0
+; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0
 ; WIN64: ret{{.*}}
 
 ; Test regcall when receiving/returning 128 bit vector
@@ -360,13 +358,11 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
 }
 
 ; X32-LABEL: test_argRet256Vector:
-; X32: vmovdqa{{.*}} %ymm0, %ymm1
-; X32: vmovdqa{{.*}} %ymm1, %ymm0
+; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0
 ; X32: ret{{.*}}
 
 ; WIN64-LABEL: test_argRet256Vector:
-; WIN64: vmovdqa{{.*}} %ymm0, %ymm1
-; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
+; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0
 ; WIN64: ret{{.*}}
 
 ; Test regcall when receiving/returning 256 bit vector
@@ -395,13 +391,11 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
 }
 
 ; X32-LABEL: test_argRet512Vector:
-; X32: vmovdqa{{.*}} %zmm0, %zmm1
-; X32: vmovdqa{{.*}} %zmm1, %zmm0
+; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0
 ; X32: ret{{.*}}
 
 ; WIN64-LABEL: test_argRet512Vector:
-; WIN64: vmovdqa{{.*}} %zmm0, %zmm1
-; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
+; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0
 ; WIN64: ret{{.*}}
 
 ; Test regcall when receiving/returning 512 bit vector
