[X86] X86FixupInstTuning - fold BLENDPS -> MOVSD #144029
Conversation
Reduces codesize - makes use of free PS<->PD domain transfers (as we do in many other places) and replaces a suitable BLENDPS mask with MOVSD if OptSize is set or the scheduler prefers it.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Reduces codesize - makes use of free PS<->PD domain transfers (as we do in many other places) and replaces a suitable BLENDPS mask with MOVSD if OptSize is set or the scheduler prefers it.

Patch is 79.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144029.diff

30 Files Affected:
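For reference, the lane equivalence the fold relies on can be sketched in standalone C++ (an illustration of the shuffle semantics only, not LLVM code): a BLENDPS whose immediate selects exactly the two low single-precision lanes from the second source produces the same 128-bit result as MOVSD, which moves the low 64-bit double-precision lane; an immediate of 0x1 similarly matches MOVSS.

```cpp
// Standalone illustration of the shuffle semantics (not LLVM code).
#include <cassert>
#include <cstring>

// blendps (register form, 4 x float): lane i of the result comes from Src when
// bit i of Imm is set, otherwise it is kept from Dst.
static void blendps(float Dst[4], const float Src[4], unsigned Imm) {
  for (int I = 0; I < 4; ++I)
    if (Imm & (1u << I))
      Dst[I] = Src[I];
}

// movsd (register form): the low 64 bits (float lanes 0 and 1) come from Src,
// the high 64 bits are kept from Dst.
static void movsd(float Dst[4], const float Src[4]) {
  std::memcpy(Dst, Src, 8);
}

int main() {
  const float A[4] = {1, 2, 3, 4}, B[4] = {5, 6, 7, 8};
  float Blend[4], Mov[4];
  std::memcpy(Blend, A, sizeof(A));
  std::memcpy(Mov, A, sizeof(A));
  blendps(Blend, B, 0x3); // lanes 0,1 from B, lanes 2,3 from A
  movsd(Mov, B);          // low double (lanes 0,1) from B
  assert(std::memcmp(Blend, Mov, sizeof(Blend)) == 0); // identical results
  return 0;
}
```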
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index be0a8c23ea5c4..6f2cef83d56b6 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,8 +222,9 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKToIntDomain(NewOpc);
};
- auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
- if (MI.getOperand(NumOperands - 1).getImm() != 1)
+ auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
+ unsigned MovImm) -> bool {
+ if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
return false;
bool Force = MF.getFunction().hasOptSize();
if (!Force && !NewOpcPreferable(MovOpc))
@@ -235,14 +236,16 @@ bool X86FixupInstTuningPass::processInstruction(
switch (Opc) {
case X86::BLENDPDrri:
- return ProcessBLENDToMOV(X86::MOVSDrr);
+ return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
case X86::VBLENDPDrri:
- return ProcessBLENDToMOV(X86::VMOVSDrr);
+ return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
case X86::BLENDPSrri:
- return ProcessBLENDToMOV(X86::MOVSSrr);
+ return ProcessBLENDToMOV(X86::MOVSSrr, 0xFF, 0x1) ||
+ ProcessBLENDToMOV(X86::MOVSDrr, 0xFF, 0x3);
case X86::VBLENDPSrri:
- return ProcessBLENDToMOV(X86::VMOVSSrr);
+ return ProcessBLENDToMOV(X86::VMOVSSrr, 0xFF, 0x1) ||
+ ProcessBLENDToMOV(X86::VMOVSDrr, 0xFF, 0x3);
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 02e6c9649c9a1..f8feceb0404b5 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -111,7 +111,7 @@ define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, doub
; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 966662f5f9f8f..f0203b3b889e4 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -300,8 +300,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; CHECK-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index 086df87d1d5ff..441c79b3fc31f 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -19,12 +19,12 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
;
; AVX-LABEL: insert_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
%1 = insertelement <2 x double> %a1, double %a0, i32 0
ret <2 x double> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 9ca4ebfec2774..a476b21979cef 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -127,7 +127,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2f2a05fa6939b..14e3767f65564 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -31,15 +31,10 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
@@ -53,15 +48,10 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test3:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
@@ -201,15 +191,10 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test9:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test9:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test9:
; AVX: # %bb.0:
@@ -223,15 +208,10 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test10:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test10:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test10:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test10:
; AVX: # %bb.0:
@@ -563,20 +543,25 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
; bitcast to use the mask-or blend combine.
define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; SSE2-LABEL: test22:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test22:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
-; SSE4-LABEL: test22:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test22:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
;
-; AVX-LABEL: test22:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test22:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test22:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
%bc1 = bitcast <2 x double> %a0 to <2 x i64>
%bc2 = bitcast <2 x double> %a1 to <2 x i64>
%and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -614,20 +599,25 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; SSE2-LABEL: test24:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test24:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
-; SSE4-LABEL: test24:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
;
-; AVX-LABEL: test24:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
%bc1 = bitcast <4 x float> %a0 to <2 x i64>
%bc2 = bitcast <4 x float> %a1 to <2 x i64>
%and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -707,15 +697,10 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; Verify that we can fold regardless of which operand is the zeroinitializer
define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2b:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2b:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2b:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2b:
; AVX: # %bb.0:
@@ -728,15 +713,10 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2c:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2c:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2c:
; AVX: # %bb.0:
@@ -750,15 +730,10 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2d:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2d:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2d:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2d:
; AVX: # %bb.0:
@@ -773,15 +748,10 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
; Make sure we can have an undef where an index pointing to the zero vector should be
define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2e:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2e:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2e:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2e:
; AVX: # %bb.0:
@@ -794,15 +764,10 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2f:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2f:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2f:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2f:
; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb1..4740bf59a69e7 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -57,7 +57,7 @@ define void @baz(ptr %arg, ptr %arg1) optsize {
; CHECK-NEXT: movaps (%rdi), %xmm0
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3]
; CHECK-NEXT: andps %xmm0, %xmm1
-; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; CHECK-NEXT: movups %xmm1, (%rsi)
; CHECK-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 443275e11459d..0afc4f784bc5e 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -577,7 +577,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -596,7 +596,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-FAST-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 6036eddb0ca84..b66ad07c466e1 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -30,13 +30,13 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
; SSE41-LABEL: insert_v2f64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2f64_z1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = insertelement <2 x double> %a, double 0.0, i32 0
ret <2 x double> %1
@@ -68,7 +68,7 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f64_0zz3:
@@ -103,7 +103,7 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
; SSE41-LABEL: insert_v2i64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2i64_z1:
@@ -137,7 +137,7 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
; SSE41-LABEL: insert_v4i64_01z3:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4i64_01z3:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 37ab4276fbcca..8c4bab99a5b7b 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6512,7 +6512,7 @@ define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE42-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index f174eaaca38c2..72e002ed6b7db 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -21,19 +21,14 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
}
define <2 x double> @insert_f64_firstelt(<2 x double> %x, double %s) {
-; SSE2-LABEL: insert_f64_firstelt:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_f64_firstelt:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_f64_firstelt:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: insert_f64_firstelt:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%i0 = insertelement <2 x double> %x, double %s, i32 0
ret <2 x double> %i0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 12bfb8d4fc9cf..325f735b09cd9 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
@@ -1333,29 +1333,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
}
define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_sd_mask:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: jne .LBB71_1
-; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT: retl
-; X86-SSE2-NEXT: .LBB71_1:
-; X86-SSE2-NEXT: addsd %xmm0, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT: retl
-;
-; X86-S...
[truncated]
; AVX512-LABEL: test22:
; AVX512: # %bb.0:
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
Why has this not been replaced?
It shows the scheduler checks are working :) - we use -mcpu=x86-64-v4 and the skylake-server model prefers vblendps to vmovsd. I can replace the -mcpu with a raw -mattr if you prefer?
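(A minimal standalone sketch of the gating idea described above - the helper name, cost fields and numbers below are illustrative assumptions, not the actual LLVM code or real skylake-server data: the fold is forced under optsize, otherwise it defers to the per-CPU scheduler model, and the blend is kept whenever the model rates it at least as well as the move.)

```cpp
// Illustrative sketch only; names, structure and numbers are assumptions.
#include <iostream>

struct InstCost {
  double RThroughput; // reciprocal throughput (lower is better)
  unsigned Latency;
};

// Returns true when rewriting to the candidate opcode looks profitable.
static bool newOpcPreferable(const InstCost &Old, const InstCost &New,
                             bool OptForSize) {
  if (OptForSize)
    return true; // the smaller encoding wins when optimizing for size
  // Otherwise only switch when the scheduler model says the new opcode is
  // strictly cheaper; on a tie or a regression the original blend is kept.
  if (New.RThroughput != Old.RThroughput)
    return New.RThroughput < Old.RThroughput;
  return New.Latency < Old.Latency;
}

int main() {
  // Hypothetical per-CPU numbers where the blend is the better choice.
  InstCost Blend{0.33, 1}, Mov{1.0, 1};
  std::cout << (newOpcPreferable(Blend, Mov, /*OptForSize=*/false)
                    ? "fold to movsd\n"
                    : "keep blendps\n"); // prints "keep blendps"
}
```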
I see. I think it's better to keep it as is to show the coverage.
LGTM.
LLVM Buildbot has detected a new failure on a builder. Full details, including the relevant piece of the build log, are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/12773
LLVM Buildbot has detected a new failure on another builder. Full details, including the relevant piece of the build log, are available at: https://lab.llvm.org/buildbot/#/builders/130/builds/13715
Reduces codesize - make use of free PS<->PD domain transfers (like we do in many other places) and replace a suitable BLENDPS mask with MOVSD if OptSize or the scheduler prefers it