Skip to content

Commit f9e5db1

Browse files
committed
[X86] foldMemoryOperandCustom - attempt to load-fold with a VEXTRACTF128/I128 into a regular load
This is mainly to help remove subvector extractions from spilled YMM registers. We can extend this for all the AVX512 variants; I've tried to make the implementation ready for this (a quick test indicated it's mainly that we're missing test coverage for AVX512). What I'm not sure about is how best we can then fold this new smaller load into another instruction (you can see some examples of this in vector-interleaved-load-i32-stride-8.ll). The comment still saying "32-byte Reload" is annoying, but we already have this for many other element/subvector load folds. Noticed while looking at next steps after llvm#129695.
1 parent d775b91 commit f9e5db1

File tree

8 files changed

+752
-1145
lines changed

8 files changed

+752
-1145
lines changed

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7395,6 +7395,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
73957395
}
73967396
}
73977397
break;
7398+
case X86::VEXTRACTF128rri:
7399+
case X86::VEXTRACTI128rri:
7400+
// Replaces subvector extraction with a load.
7401+
// TODO: Add AVX512 variants.
7402+
if (OpNum == 1) {
7403+
unsigned Idx = MI.getOperand(MI.getNumOperands() - 1).getImm();
7404+
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7405+
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), 0, &RI, MF);
7406+
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7407+
assert((RCSize == 16) && "Unexpected dst register size");
7408+
int PtrOffset = Idx * RCSize;
7409+
7410+
unsigned NewOpCode;
7411+
switch (MI.getOpcode()) {
7412+
case X86::VEXTRACTF128rri:
7413+
NewOpCode = Alignment < Align(RCSize) ? X86::VMOVUPSrm : X86::VMOVAPSrm;
7414+
break;
7415+
case X86::VEXTRACTI128rri:
7416+
NewOpCode = Alignment < Align(RCSize) ? X86::VMOVDQUrm : X86::VMOVDQArm;
7417+
break;
7418+
default:
7419+
llvm_unreachable("Unexpected EXTRACT_SUBVECTOR instruction");
7420+
}
7421+
7422+
MachineInstr *NewMI =
7423+
fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7424+
NewMI->removeOperand(NewMI->getNumOperands() - 1);
7425+
return NewMI;
7426+
}
7427+
break;
73987428
case X86::MOV32r0:
73997429
if (auto *NewMI =
74007430
makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,

llvm/test/CodeGen/X86/bfloat.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1743,11 +1743,9 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
17431743
; AVXNC-NEXT: # xmm0 = mem[1,0]
17441744
; AVXNC-NEXT: callq __truncdfbf2@PLT
17451745
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1746-
; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1747-
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
1746+
; AVXNC-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
17481747
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
17491748
; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1750-
; AVXNC-NEXT: vzeroupper
17511749
; AVXNC-NEXT: callq __truncdfbf2@PLT
17521750
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
17531751
; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
@@ -1759,10 +1757,8 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
17591757
; AVXNC-NEXT: # xmm0 = mem[1,0]
17601758
; AVXNC-NEXT: callq __truncdfbf2@PLT
17611759
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1762-
; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1763-
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
1760+
; AVXNC-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
17641761
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1765-
; AVXNC-NEXT: vzeroupper
17661762
; AVXNC-NEXT: callq __truncdfbf2@PLT
17671763
; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
17681764
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload

llvm/test/CodeGen/X86/fma.ll

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,23 +1221,20 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
12211221
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
12221222
; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
12231223
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
1224-
; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
12251224
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
12261225
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
1226+
; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
1227+
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 32-byte Reload
1228+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x70,0x01,0x00,0x00]
1229+
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 32-byte Reload
1230+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0x90,0x01,0x00,0x00]
12271231
; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
1228-
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
1229-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
1230-
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
1231-
; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
1232-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
1233-
; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
1234-
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
1235-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
1236-
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
1237-
; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
1238-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
1239-
; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
1240-
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
1232+
; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
1233+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x80,0x00,0x00,0x00]
1234+
; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
1235+
; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
1236+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60]
1237+
; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
12411238
; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
12421239
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
12431240
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
@@ -1981,21 +1978,18 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
19811978
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x20]
19821979
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
19831980
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
1981+
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 32-byte Reload
1982+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0x10,0x01,0x00,0x00]
19841983
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
1985-
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
1986-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
1987-
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
1988-
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
1989-
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
1984+
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 32-byte Reload
1985+
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xf0,0x00,0x00,0x00]
19901986
; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
19911987
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20]
1992-
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
19931988
; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
19941989
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30]
19951990
; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
19961991
; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1]
19971992
; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
1998-
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
19991993
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
20001994
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
20011995
; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero

llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -290,11 +290,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
290290
; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
291291
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
292292
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
293-
; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
294-
; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
293+
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
295294
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
296295
; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
297-
; CHECK-AVX2-NEXT: vzeroupper
298296
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
299297
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
300298
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload

llvm/test/CodeGen/X86/frem.ll

Lines changed: 22 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,10 @@ define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind {
130130
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
131131
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
132132
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
133-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
134-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
135-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
136-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
137-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
133+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
134+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
135+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
138136
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
139-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
140-
; CHECK-NEXT: vzeroupper
141137
; CHECK-NEXT: callq fmodf@PLT
142138
; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
143139
; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -195,14 +191,10 @@ define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind {
195191
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
196192
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
197193
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
198-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
199-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
200-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
201-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
202-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
194+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
195+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
196+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
203197
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
204-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
205-
; CHECK-NEXT: vzeroupper
206198
; CHECK-NEXT: callq fmodf@PLT
207199
; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
208200
; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -280,14 +272,10 @@ define void @frem_v8f32(<8 x float> %a0, <8 x float> %a1, ptr%p3) nounwind {
280272
; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
281273
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
282274
; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
283-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
284-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
285-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
286-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
287-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
275+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
276+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
277+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
288278
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
289-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
290-
; CHECK-NEXT: vzeroupper
291279
; CHECK-NEXT: callq fmodf@PLT
292280
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
293281
; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -389,14 +377,10 @@ define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind {
389377
; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
390378
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
391379
; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
392-
; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
393-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
394-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
395-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
396-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
380+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
381+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
382+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
397383
; CHECK-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
398-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
399-
; CHECK-NEXT: vzeroupper
400384
; CHECK-NEXT: callq fmod@PLT
401385
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
402386
; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -422,14 +406,10 @@ define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind {
422406
; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
423407
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
424408
; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
425-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
426-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
427-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
428-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
429-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
409+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
410+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
411+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
430412
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
431-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
432-
; CHECK-NEXT: vzeroupper
433413
; CHECK-NEXT: callq fmod@PLT
434414
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435415
; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -475,14 +455,10 @@ define void @frem_v4f64(<4 x double> %a0, <4 x double> %a1, ptr%p3) nounwind {
475455
; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
476456
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
477457
; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
478-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
479-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
480-
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
481-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
482-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
458+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
459+
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
460+
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
483461
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
484-
; CHECK-NEXT: vmovaps %xmm2, %xmm0
485-
; CHECK-NEXT: vzeroupper
486462
; CHECK-NEXT: callq fmod@PLT
487463
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
488464
; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -545,11 +521,9 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, ptr%p3) nounwind {
545521
; CHECK-NEXT: vzeroupper
546522
; CHECK-NEXT: callq __extendhfsf2@PLT
547523
; CHECK-NEXT: vmovd %xmm0, (%rsp) # 4-byte Folded Spill
548-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
549-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
524+
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
550525
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
551526
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
552-
; CHECK-NEXT: vzeroupper
553527
; CHECK-NEXT: callq __extendhfsf2@PLT
554528
; CHECK-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
555529
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
@@ -773,18 +747,15 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, ptr%p3) nounwind {
773747
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
774748
; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
775749
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
776-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
777-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
750+
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
778751
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
779752
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
780753
; CHECK-NEXT: vzeroupper
781754
; CHECK-NEXT: callq __extendhfsf2@PLT
782755
; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
783-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
784-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
756+
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
785757
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
786758
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
787-
; CHECK-NEXT: vzeroupper
788759
; CHECK-NEXT: callq __extendhfsf2@PLT
789760
; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
790761
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
@@ -1033,11 +1004,9 @@ define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, ptr%p3) nounwind {
10331004
; CHECK-NEXT: vzeroupper
10341005
; CHECK-NEXT: callq __extendhfsf2@PLT
10351006
; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1036-
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1037-
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
1007+
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
10381008
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10391009
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1040-
; CHECK-NEXT: vzeroupper
10411010
; CHECK-NEXT: callq __extendhfsf2@PLT
10421011
; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
10431012
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero

0 commit comments

Comments
 (0)