Commit 0586023

[X86] X86FixupInstTuning - fold BLENDPS -> MOVSD (#144029)
Reduces code size: make use of free PS<->PD domain transfers (as we already do in many other places) and replace a BLENDPS whose mask is MOVSD-compatible with MOVSD if OptSize is set or the scheduler prefers it.
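The fold is sound because a BLENDPS immediate whose low four bits are 0b0011 takes exactly the low two float lanes (the low 64 bits) from the other source, which is what the register form of MOVSD does, modulo the free PS<->PD domain transfer. Below is a minimal standalone sketch of that equivalence, using made-up helper names rather than anything from the LLVM pass:

// blendps_vs_movsd.cpp - illustrative only; models the two instructions on a
// 4 x float vector to show that BLENDPS with mask 0b0011 and MOVSD produce
// the same result: low 64 bits from src, upper 64 bits kept from dst.
#include <array>
#include <cassert>
#include <cstring>

using V4F = std::array<float, 4>;

// BLENDPS dst, src, imm: lane i comes from src when bit i of imm is set.
static V4F blendps(V4F dst, const V4F &src, unsigned imm) {
  for (int i = 0; i < 4; ++i)
    if (imm & (1u << i))
      dst[i] = src[i];
  return dst;
}

// MOVSD dst, src (register form): copy the low 64 bits of src over dst.
static V4F movsd(V4F dst, const V4F &src) {
  std::memcpy(dst.data(), src.data(), 8);
  return dst;
}

int main() {
  V4F a{1, 2, 3, 4}, b{5, 6, 7, 8};
  assert(blendps(a, b, 0x3) == movsd(a, b)); // mask 0b0011 <=> MOVSD
  return 0;
}

The mask 0b0001 case (a single low float lane) was already folded to MOVSS before this change; the new Mask/MovImm parameters let the same lambda also recognize the 0b0011 pattern, and the MOV is only emitted under OptSize or when the scheduler model prefers it.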
1 parent 5762491 commit 0586023

30 files changed: +258 -391 lines changed

llvm/lib/Target/X86/X86FixupInstTuning.cpp

Lines changed: 9 additions & 6 deletions
@@ -222,8 +222,9 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };

-  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
-    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
+                               unsigned MovImm) -> bool {
+    if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
       return false;
     bool Force = MF.getFunction().hasOptSize();
     if (!Force && !NewOpcPreferable(MovOpc))
@@ -235,14 +236,16 @@ bool X86FixupInstTuningPass::processInstruction(

   switch (Opc) {
   case X86::BLENDPDrri:
-    return ProcessBLENDToMOV(X86::MOVSDrr);
+    return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
   case X86::VBLENDPDrri:
-    return ProcessBLENDToMOV(X86::VMOVSDrr);
+    return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);

   case X86::BLENDPSrri:
-    return ProcessBLENDToMOV(X86::MOVSSrr);
+    return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
   case X86::VBLENDPSrri:
-    return ProcessBLENDToMOV(X86::VMOVSSrr);
+    return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);

   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);

llvm/test/CodeGen/X86/avx-insertelt.ll

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, doub
 ; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 ;

llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

Lines changed: 2 additions & 2 deletions
@@ -300,8 +300,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_sse41_blendpd:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; CHECK-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
 ret <2 x double> %res

llvm/test/CodeGen/X86/coalesce_commute_movsd.ll

Lines changed: 2 additions & 2 deletions
@@ -19,12 +19,12 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
 ;
 ; AVX-LABEL: insert_f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: insert_f64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT: retq
 %1 = insertelement <2 x double> %a1, double %a0, i32 0
 ret <2 x double> %1

llvm/test/CodeGen/X86/combine-and.ll

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
 ; SSE-LABEL: test7:
 ; SSE: # %bb.0:
 ; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test7:

llvm/test/CodeGen/X86/combine-or-shuffle.ll

Lines changed: 70 additions & 105 deletions
@@ -31,15 +31,10 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {


 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2:
 ; AVX: # %bb.0:
@@ -53,15 +48,10 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {


 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test3:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test3:
 ; AVX: # %bb.0:
@@ -201,15 +191,10 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {


 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test9:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test9:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test9:
 ; AVX: # %bb.0:
@@ -223,15 +208,10 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {


 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test10:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test10:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test10:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test10:
 ; AVX: # %bb.0:
@@ -563,20 +543,25 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; bitcast to use the mask-or blend combine.

 define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; SSE2-LABEL: test22:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test22:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
-; SSE4-LABEL: test22:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test22:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
 ;
-; AVX-LABEL: test22:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test22:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test22:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
 %bc1 = bitcast <2 x double> %a0 to <2 x i64>
 %bc2 = bitcast <2 x double> %a1 to <2 x i64>
 %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -614,20 +599,25 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {


 define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; SSE2-LABEL: test24:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test24:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
-; SSE4-LABEL: test24:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
 ;
-; AVX-LABEL: test24:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
 %bc1 = bitcast <4 x float> %a0 to <2 x i64>
 %bc2 = bitcast <4 x float> %a1 to <2 x i64>
 %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -707,15 +697,10 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
 ; Verify that we can fold regardless of which operand is the zeroinitializer

 define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2b:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2b:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2b:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2b:
 ; AVX: # %bb.0:
@@ -728,15 +713,10 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
 }

 define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2c:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2c:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2c:
 ; AVX: # %bb.0:
@@ -750,15 +730,10 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {


 define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2d:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2d:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2d:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2d:
 ; AVX: # %bb.0:
@@ -773,15 +748,10 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
 ; Make sure we can have an undef where an index pointing to the zero vector should be

 define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2e:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2e:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2e:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2e:
 ; AVX: # %bb.0:
@@ -794,15 +764,10 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
 }

 define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2f:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2f:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2f:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test2f:
 ; AVX: # %bb.0:

llvm/test/CodeGen/X86/commute-blend-sse41.ll

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ define void @baz(ptr %arg, ptr %arg1) optsize {
 ; CHECK-NEXT: movaps (%rdi), %xmm0
 ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3]
 ; CHECK-NEXT: andps %xmm0, %xmm1
-; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT: movups %xmm1, (%rsi)
 ; CHECK-NEXT: retq
 bb:

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 2 additions & 2 deletions
@@ -577,7 +577,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -596,7 +596,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-FAST-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
