Skip to content

Commit d9183fd

Browse files
authored
[X86] LowerSelect - use BLENDV for scalar selection on all SSE41+ targets (#125853)
When we first began (in 2015) to lower f32/f64 selects to X86ISD::BLENDV(scalar_to_vector(),scalar_to_vector(),scalar_to_vector()), we limited it to AVX targets to avoid issues with SSE41's xmm0 constraint for the condition mask. Since then we've seen general improvements in TwoAddressInstruction and better handling of condition commutation for X86ISD::BLENDV nodes, which should address many of the original concerns about using SSE41 BLENDVPD/S. In most cases we will replace 3 logic instructions with the BLENDV node and (up to 3) additional moves. Although the BLENDV is often more expensive on original SSE41 targets, this should still be an improvement in the majority of cases. We also have no equivalent restrictions on SSE41 for v2f64/v4f32 vector selection. Fixes #105807.
1 parent f845497 commit d9183fd

File tree

11 files changed

+782
-738
lines changed

11 files changed

+782
-738
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24648,19 +24648,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2464824648
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
2464924649
DAG.getTargetConstant(SSECC, DL, MVT::i8));
2465024650

24651-
// If we have AVX, we can use a variable vector select (VBLENDV) instead
24652-
// of 3 logic instructions for size savings and potentially speed.
24651+
// If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
24652+
// instead of 3 logic instructions for size savings and potentially speed.
2465324653
// Unfortunately, there is no scalar form of VBLENDV.
24654-
24654+
//
2465524655
// If either operand is a +0.0 constant, don't try this. We can expect to
2465624656
// optimize away at least one of the logic instructions later in that
2465724657
// case, so that sequence would be faster than a variable blend.
24658-
24659-
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24660-
// uses XMM0 as the selection register. That may need just as many
24661-
// instructions as the AND/ANDN/OR sequence due to register moves, so
24662-
// don't bother.
24663-
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24658+
if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
2466424659
!isNullFPConstant(Op2)) {
2466524660
// Convert to vectors, do a VSELECT, and convert back to scalar.
2466624661
// All of the conversions should be optimized away.

llvm/test/CodeGen/X86/fmaxnum.ll

Lines changed: 80 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,26 @@ declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
2222
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
2323

2424
define float @test_fmaxf(float %x, float %y) {
25-
; SSE-LABEL: test_fmaxf:
26-
; SSE: # %bb.0:
27-
; SSE-NEXT: movaps %xmm0, %xmm2
28-
; SSE-NEXT: cmpunordss %xmm0, %xmm2
29-
; SSE-NEXT: movaps %xmm2, %xmm3
30-
; SSE-NEXT: andps %xmm1, %xmm3
31-
; SSE-NEXT: maxss %xmm0, %xmm1
32-
; SSE-NEXT: andnps %xmm1, %xmm2
33-
; SSE-NEXT: orps %xmm3, %xmm2
34-
; SSE-NEXT: movaps %xmm2, %xmm0
35-
; SSE-NEXT: retq
25+
; SSE2-LABEL: test_fmaxf:
26+
; SSE2: # %bb.0:
27+
; SSE2-NEXT: movaps %xmm0, %xmm2
28+
; SSE2-NEXT: cmpunordss %xmm0, %xmm2
29+
; SSE2-NEXT: movaps %xmm2, %xmm3
30+
; SSE2-NEXT: andps %xmm1, %xmm3
31+
; SSE2-NEXT: maxss %xmm0, %xmm1
32+
; SSE2-NEXT: andnps %xmm1, %xmm2
33+
; SSE2-NEXT: orps %xmm3, %xmm2
34+
; SSE2-NEXT: movaps %xmm2, %xmm0
35+
; SSE2-NEXT: retq
36+
;
37+
; SSE4-LABEL: test_fmaxf:
38+
; SSE4: # %bb.0:
39+
; SSE4-NEXT: movaps %xmm1, %xmm2
40+
; SSE4-NEXT: maxss %xmm0, %xmm2
41+
; SSE4-NEXT: cmpunordss %xmm0, %xmm0
42+
; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
43+
; SSE4-NEXT: movaps %xmm2, %xmm0
44+
; SSE4-NEXT: retq
3645
;
3746
; AVX1-LABEL: test_fmaxf:
3847
; AVX1: # %bb.0:
@@ -63,17 +72,26 @@ define float @test_fmaxf_minsize(float %x, float %y) minsize {
6372
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
6473

6574
define double @test_fmax(double %x, double %y) {
66-
; SSE-LABEL: test_fmax:
67-
; SSE: # %bb.0:
68-
; SSE-NEXT: movapd %xmm0, %xmm2
69-
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
70-
; SSE-NEXT: movapd %xmm2, %xmm3
71-
; SSE-NEXT: andpd %xmm1, %xmm3
72-
; SSE-NEXT: maxsd %xmm0, %xmm1
73-
; SSE-NEXT: andnpd %xmm1, %xmm2
74-
; SSE-NEXT: orpd %xmm3, %xmm2
75-
; SSE-NEXT: movapd %xmm2, %xmm0
76-
; SSE-NEXT: retq
75+
; SSE2-LABEL: test_fmax:
76+
; SSE2: # %bb.0:
77+
; SSE2-NEXT: movapd %xmm0, %xmm2
78+
; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
79+
; SSE2-NEXT: movapd %xmm2, %xmm3
80+
; SSE2-NEXT: andpd %xmm1, %xmm3
81+
; SSE2-NEXT: maxsd %xmm0, %xmm1
82+
; SSE2-NEXT: andnpd %xmm1, %xmm2
83+
; SSE2-NEXT: orpd %xmm3, %xmm2
84+
; SSE2-NEXT: movapd %xmm2, %xmm0
85+
; SSE2-NEXT: retq
86+
;
87+
; SSE4-LABEL: test_fmax:
88+
; SSE4: # %bb.0:
89+
; SSE4-NEXT: movapd %xmm1, %xmm2
90+
; SSE4-NEXT: maxsd %xmm0, %xmm2
91+
; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
92+
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
93+
; SSE4-NEXT: movapd %xmm2, %xmm0
94+
; SSE4-NEXT: retq
7795
;
7896
; AVX1-LABEL: test_fmax:
7997
; AVX1: # %bb.0:
@@ -111,17 +129,26 @@ define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
111129
}
112130

113131
define float @test_intrinsic_fmaxf(float %x, float %y) {
114-
; SSE-LABEL: test_intrinsic_fmaxf:
115-
; SSE: # %bb.0:
116-
; SSE-NEXT: movaps %xmm0, %xmm2
117-
; SSE-NEXT: cmpunordss %xmm0, %xmm2
118-
; SSE-NEXT: movaps %xmm2, %xmm3
119-
; SSE-NEXT: andps %xmm1, %xmm3
120-
; SSE-NEXT: maxss %xmm0, %xmm1
121-
; SSE-NEXT: andnps %xmm1, %xmm2
122-
; SSE-NEXT: orps %xmm3, %xmm2
123-
; SSE-NEXT: movaps %xmm2, %xmm0
124-
; SSE-NEXT: retq
132+
; SSE2-LABEL: test_intrinsic_fmaxf:
133+
; SSE2: # %bb.0:
134+
; SSE2-NEXT: movaps %xmm0, %xmm2
135+
; SSE2-NEXT: cmpunordss %xmm0, %xmm2
136+
; SSE2-NEXT: movaps %xmm2, %xmm3
137+
; SSE2-NEXT: andps %xmm1, %xmm3
138+
; SSE2-NEXT: maxss %xmm0, %xmm1
139+
; SSE2-NEXT: andnps %xmm1, %xmm2
140+
; SSE2-NEXT: orps %xmm3, %xmm2
141+
; SSE2-NEXT: movaps %xmm2, %xmm0
142+
; SSE2-NEXT: retq
143+
;
144+
; SSE4-LABEL: test_intrinsic_fmaxf:
145+
; SSE4: # %bb.0:
146+
; SSE4-NEXT: movaps %xmm1, %xmm2
147+
; SSE4-NEXT: maxss %xmm0, %xmm2
148+
; SSE4-NEXT: cmpunordss %xmm0, %xmm0
149+
; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
150+
; SSE4-NEXT: movaps %xmm2, %xmm0
151+
; SSE4-NEXT: retq
125152
;
126153
; AVX1-LABEL: test_intrinsic_fmaxf:
127154
; AVX1: # %bb.0:
@@ -142,17 +169,26 @@ define float @test_intrinsic_fmaxf(float %x, float %y) {
142169
}
143170

144171
define double @test_intrinsic_fmax(double %x, double %y) {
145-
; SSE-LABEL: test_intrinsic_fmax:
146-
; SSE: # %bb.0:
147-
; SSE-NEXT: movapd %xmm0, %xmm2
148-
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
149-
; SSE-NEXT: movapd %xmm2, %xmm3
150-
; SSE-NEXT: andpd %xmm1, %xmm3
151-
; SSE-NEXT: maxsd %xmm0, %xmm1
152-
; SSE-NEXT: andnpd %xmm1, %xmm2
153-
; SSE-NEXT: orpd %xmm3, %xmm2
154-
; SSE-NEXT: movapd %xmm2, %xmm0
155-
; SSE-NEXT: retq
172+
; SSE2-LABEL: test_intrinsic_fmax:
173+
; SSE2: # %bb.0:
174+
; SSE2-NEXT: movapd %xmm0, %xmm2
175+
; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
176+
; SSE2-NEXT: movapd %xmm2, %xmm3
177+
; SSE2-NEXT: andpd %xmm1, %xmm3
178+
; SSE2-NEXT: maxsd %xmm0, %xmm1
179+
; SSE2-NEXT: andnpd %xmm1, %xmm2
180+
; SSE2-NEXT: orpd %xmm3, %xmm2
181+
; SSE2-NEXT: movapd %xmm2, %xmm0
182+
; SSE2-NEXT: retq
183+
;
184+
; SSE4-LABEL: test_intrinsic_fmax:
185+
; SSE4: # %bb.0:
186+
; SSE4-NEXT: movapd %xmm1, %xmm2
187+
; SSE4-NEXT: maxsd %xmm0, %xmm2
188+
; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
189+
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
190+
; SSE4-NEXT: movapd %xmm2, %xmm0
191+
; SSE4-NEXT: retq
156192
;
157193
; AVX1-LABEL: test_intrinsic_fmax:
158194
; AVX1: # %bb.0:

llvm/test/CodeGen/X86/fminnum.ll

Lines changed: 80 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,26 @@ declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
2222
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
2323

2424
define float @test_fminf(float %x, float %y) {
25-
; SSE-LABEL: test_fminf:
26-
; SSE: # %bb.0:
27-
; SSE-NEXT: movaps %xmm0, %xmm2
28-
; SSE-NEXT: cmpunordss %xmm0, %xmm2
29-
; SSE-NEXT: movaps %xmm2, %xmm3
30-
; SSE-NEXT: andps %xmm1, %xmm3
31-
; SSE-NEXT: minss %xmm0, %xmm1
32-
; SSE-NEXT: andnps %xmm1, %xmm2
33-
; SSE-NEXT: orps %xmm3, %xmm2
34-
; SSE-NEXT: movaps %xmm2, %xmm0
35-
; SSE-NEXT: retq
25+
; SSE2-LABEL: test_fminf:
26+
; SSE2: # %bb.0:
27+
; SSE2-NEXT: movaps %xmm0, %xmm2
28+
; SSE2-NEXT: cmpunordss %xmm0, %xmm2
29+
; SSE2-NEXT: movaps %xmm2, %xmm3
30+
; SSE2-NEXT: andps %xmm1, %xmm3
31+
; SSE2-NEXT: minss %xmm0, %xmm1
32+
; SSE2-NEXT: andnps %xmm1, %xmm2
33+
; SSE2-NEXT: orps %xmm3, %xmm2
34+
; SSE2-NEXT: movaps %xmm2, %xmm0
35+
; SSE2-NEXT: retq
36+
;
37+
; SSE4-LABEL: test_fminf:
38+
; SSE4: # %bb.0:
39+
; SSE4-NEXT: movaps %xmm1, %xmm2
40+
; SSE4-NEXT: minss %xmm0, %xmm2
41+
; SSE4-NEXT: cmpunordss %xmm0, %xmm0
42+
; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
43+
; SSE4-NEXT: movaps %xmm2, %xmm0
44+
; SSE4-NEXT: retq
3645
;
3746
; AVX1-LABEL: test_fminf:
3847
; AVX1: # %bb.0:
@@ -63,17 +72,26 @@ define float @test_fminf_minsize(float %x, float %y) minsize {
6372
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
6473

6574
define double @test_fmin(double %x, double %y) {
66-
; SSE-LABEL: test_fmin:
67-
; SSE: # %bb.0:
68-
; SSE-NEXT: movapd %xmm0, %xmm2
69-
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
70-
; SSE-NEXT: movapd %xmm2, %xmm3
71-
; SSE-NEXT: andpd %xmm1, %xmm3
72-
; SSE-NEXT: minsd %xmm0, %xmm1
73-
; SSE-NEXT: andnpd %xmm1, %xmm2
74-
; SSE-NEXT: orpd %xmm3, %xmm2
75-
; SSE-NEXT: movapd %xmm2, %xmm0
76-
; SSE-NEXT: retq
75+
; SSE2-LABEL: test_fmin:
76+
; SSE2: # %bb.0:
77+
; SSE2-NEXT: movapd %xmm0, %xmm2
78+
; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
79+
; SSE2-NEXT: movapd %xmm2, %xmm3
80+
; SSE2-NEXT: andpd %xmm1, %xmm3
81+
; SSE2-NEXT: minsd %xmm0, %xmm1
82+
; SSE2-NEXT: andnpd %xmm1, %xmm2
83+
; SSE2-NEXT: orpd %xmm3, %xmm2
84+
; SSE2-NEXT: movapd %xmm2, %xmm0
85+
; SSE2-NEXT: retq
86+
;
87+
; SSE4-LABEL: test_fmin:
88+
; SSE4: # %bb.0:
89+
; SSE4-NEXT: movapd %xmm1, %xmm2
90+
; SSE4-NEXT: minsd %xmm0, %xmm2
91+
; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
92+
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
93+
; SSE4-NEXT: movapd %xmm2, %xmm0
94+
; SSE4-NEXT: retq
7795
;
7896
; AVX1-LABEL: test_fmin:
7997
; AVX1: # %bb.0:
@@ -111,17 +129,26 @@ define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) {
111129
}
112130

113131
define float @test_intrinsic_fminf(float %x, float %y) {
114-
; SSE-LABEL: test_intrinsic_fminf:
115-
; SSE: # %bb.0:
116-
; SSE-NEXT: movaps %xmm0, %xmm2
117-
; SSE-NEXT: cmpunordss %xmm0, %xmm2
118-
; SSE-NEXT: movaps %xmm2, %xmm3
119-
; SSE-NEXT: andps %xmm1, %xmm3
120-
; SSE-NEXT: minss %xmm0, %xmm1
121-
; SSE-NEXT: andnps %xmm1, %xmm2
122-
; SSE-NEXT: orps %xmm3, %xmm2
123-
; SSE-NEXT: movaps %xmm2, %xmm0
124-
; SSE-NEXT: retq
132+
; SSE2-LABEL: test_intrinsic_fminf:
133+
; SSE2: # %bb.0:
134+
; SSE2-NEXT: movaps %xmm0, %xmm2
135+
; SSE2-NEXT: cmpunordss %xmm0, %xmm2
136+
; SSE2-NEXT: movaps %xmm2, %xmm3
137+
; SSE2-NEXT: andps %xmm1, %xmm3
138+
; SSE2-NEXT: minss %xmm0, %xmm1
139+
; SSE2-NEXT: andnps %xmm1, %xmm2
140+
; SSE2-NEXT: orps %xmm3, %xmm2
141+
; SSE2-NEXT: movaps %xmm2, %xmm0
142+
; SSE2-NEXT: retq
143+
;
144+
; SSE4-LABEL: test_intrinsic_fminf:
145+
; SSE4: # %bb.0:
146+
; SSE4-NEXT: movaps %xmm1, %xmm2
147+
; SSE4-NEXT: minss %xmm0, %xmm2
148+
; SSE4-NEXT: cmpunordss %xmm0, %xmm0
149+
; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
150+
; SSE4-NEXT: movaps %xmm2, %xmm0
151+
; SSE4-NEXT: retq
125152
;
126153
; AVX1-LABEL: test_intrinsic_fminf:
127154
; AVX1: # %bb.0:
@@ -142,17 +169,26 @@ define float @test_intrinsic_fminf(float %x, float %y) {
142169
}
143170

144171
define double @test_intrinsic_fmin(double %x, double %y) {
145-
; SSE-LABEL: test_intrinsic_fmin:
146-
; SSE: # %bb.0:
147-
; SSE-NEXT: movapd %xmm0, %xmm2
148-
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
149-
; SSE-NEXT: movapd %xmm2, %xmm3
150-
; SSE-NEXT: andpd %xmm1, %xmm3
151-
; SSE-NEXT: minsd %xmm0, %xmm1
152-
; SSE-NEXT: andnpd %xmm1, %xmm2
153-
; SSE-NEXT: orpd %xmm3, %xmm2
154-
; SSE-NEXT: movapd %xmm2, %xmm0
155-
; SSE-NEXT: retq
172+
; SSE2-LABEL: test_intrinsic_fmin:
173+
; SSE2: # %bb.0:
174+
; SSE2-NEXT: movapd %xmm0, %xmm2
175+
; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
176+
; SSE2-NEXT: movapd %xmm2, %xmm3
177+
; SSE2-NEXT: andpd %xmm1, %xmm3
178+
; SSE2-NEXT: minsd %xmm0, %xmm1
179+
; SSE2-NEXT: andnpd %xmm1, %xmm2
180+
; SSE2-NEXT: orpd %xmm3, %xmm2
181+
; SSE2-NEXT: movapd %xmm2, %xmm0
182+
; SSE2-NEXT: retq
183+
;
184+
; SSE4-LABEL: test_intrinsic_fmin:
185+
; SSE4: # %bb.0:
186+
; SSE4-NEXT: movapd %xmm1, %xmm2
187+
; SSE4-NEXT: minsd %xmm0, %xmm2
188+
; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
189+
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
190+
; SSE4-NEXT: movapd %xmm2, %xmm0
191+
; SSE4-NEXT: retq
156192
;
157193
; AVX1-LABEL: test_intrinsic_fmin:
158194
; AVX1: # %bb.0:

llvm/test/CodeGen/X86/fp-select-cmp-and.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,9 @@ define float @test17(float %a, float %b, float %c, float %eps) {
189189
; CHECK-LABEL: test17:
190190
; CHECK: # %bb.0:
191191
; CHECK-NEXT: cmpless %xmm0, %xmm3
192-
; CHECK-NEXT: andps %xmm3, %xmm2
193-
; CHECK-NEXT: andnps %xmm1, %xmm3
194-
; CHECK-NEXT: orps %xmm2, %xmm3
195192
; CHECK-NEXT: movaps %xmm3, %xmm0
193+
; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1
194+
; CHECK-NEXT: movaps %xmm1, %xmm0
196195
; CHECK-NEXT: retq
197196
%cmp = fcmp oge float %a, %eps
198197
%cond = select i1 %cmp, float %c, float %b
@@ -203,10 +202,9 @@ define double @test18(double %a, double %b, double %c, double %eps) {
203202
; CHECK-LABEL: test18:
204203
; CHECK: # %bb.0:
205204
; CHECK-NEXT: cmplesd %xmm0, %xmm3
206-
; CHECK-NEXT: andpd %xmm3, %xmm2
207-
; CHECK-NEXT: andnpd %xmm1, %xmm3
208-
; CHECK-NEXT: orpd %xmm2, %xmm3
209205
; CHECK-NEXT: movapd %xmm3, %xmm0
206+
; CHECK-NEXT: blendvpd %xmm0, %xmm2, %xmm1
207+
; CHECK-NEXT: movapd %xmm1, %xmm0
210208
; CHECK-NEXT: retq
211209
%cmp = fcmp oge double %a, %eps
212210
%cond = select i1 %cmp, double %c, double %b

0 commit comments

Comments
 (0)