Skip to content

Commit 54da543

Browse files
authored
[SelectionDAG] Avoid one comparison when legalizing fmaximum (#142732)
When ordering signed zeros, only check the sign of one of the values. We already know at this point that both values must be +/-0.0, so it is sufficient to check one of them to order them correctly. For example, for fmaximum, if we know LHS is `+0.0` then we can always select LHS; the value of RHS does not matter. If LHS is `-0.0` we can always select RHS; again, the value of RHS does not matter.
1 parent 01a6d0f commit 54da543

File tree

9 files changed

+773
-958
lines changed

9 files changed

+773
-958
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8610,19 +8610,16 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
86108610
// fminimum/fmaximum requires -0.0 less than +0.0
86118611
if (!MinMaxMustRespectOrderedZero && !N->getFlags().hasNoSignedZeros() &&
86128612
!DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS)) {
8613-
auto IsSpecificZero = [&](SDValue F) {
8614-
FloatSignAsInt State;
8615-
DAG.getSignAsIntValue(State, DL, F);
8616-
return DAG.getSetCC(DL, CCVT, State.IntValue,
8617-
DAG.getConstant(0, DL, State.IntValue.getValueType()),
8618-
IsMax ? ISD::SETEQ : ISD::SETNE);
8619-
};
86208613
SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
86218614
DAG.getConstantFP(0.0, DL, VT), ISD::SETOEQ);
8622-
SDValue LCmp =
8623-
DAG.getSelect(DL, VT, IsSpecificZero(LHS), LHS, MinMax, Flags);
8624-
SDValue RCmp = DAG.getSelect(DL, VT, IsSpecificZero(RHS), RHS, LCmp, Flags);
8625-
MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
8615+
FloatSignAsInt State;
8616+
DAG.getSignAsIntValue(State, DL, LHS);
8617+
SDValue IsSpecificZero =
8618+
DAG.getSetCC(DL, CCVT, State.IntValue,
8619+
DAG.getConstant(0, DL, State.IntValue.getValueType()),
8620+
IsMax ? ISD::SETEQ : ISD::SETNE);
8621+
SDValue Sel = DAG.getSelect(DL, VT, IsSpecificZero, LHS, RHS, Flags);
8622+
MinMax = DAG.getSelect(DL, VT, IsZero, Sel, MinMax, Flags);
86268623
}
86278624

86288625
return MinMax;

llvm/test/CodeGen/AArch64/fmaximum-legalization.ll

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -46,51 +46,46 @@ define fp128 @maximum_fp128(fp128 %x, fp128 %y) nounwind {
4646
; CHECK-LABEL: maximum_fp128:
4747
; CHECK: // %bb.0:
4848
; CHECK-NEXT: sub sp, sp, #96
49+
; CHECK-NEXT: str q0, [sp, #64]
50+
; CHECK-NEXT: mov v2.16b, v1.16b
51+
; CHECK-NEXT: ldrb w8, [sp, #79]
4952
; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
50-
; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
51-
; CHECK-NEXT: stp q1, q0, [sp, #48]
52-
; CHECK-NEXT: bl __gttf2
53-
; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
54-
; CHECK-NEXT: cmp w0, #0
55-
; CHECK-NEXT: b.le .LBB1_2
53+
; CHECK-NEXT: cmp w8, #0
54+
; CHECK-NEXT: b.ne .LBB1_2
5655
; CHECK-NEXT: // %bb.1:
57-
; CHECK-NEXT: mov v1.16b, v0.16b
56+
; CHECK-NEXT: mov v2.16b, v0.16b
5857
; CHECK-NEXT: .LBB1_2:
5958
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
60-
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
61-
; CHECK-NEXT: bl __unordtf2
62-
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
59+
; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
60+
; CHECK-NEXT: stp q2, q0, [sp] // 32-byte Folded Spill
61+
; CHECK-NEXT: bl __gttf2
62+
; CHECK-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload
6363
; CHECK-NEXT: cmp w0, #0
64-
; CHECK-NEXT: b.eq .LBB1_4
64+
; CHECK-NEXT: mov v2.16b, v1.16b
65+
; CHECK-NEXT: b.le .LBB1_4
6566
; CHECK-NEXT: // %bb.3:
66-
; CHECK-NEXT: adrp x8, .LCPI1_0
67-
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
67+
; CHECK-NEXT: mov v2.16b, v0.16b
6868
; CHECK-NEXT: .LBB1_4:
69-
; CHECK-NEXT: ldrb w8, [sp, #79]
70-
; CHECK-NEXT: mov v1.16b, v0.16b
71-
; CHECK-NEXT: cmp w8, #0
72-
; CHECK-NEXT: b.ne .LBB1_6
69+
; CHECK-NEXT: str q2, [sp, #48] // 16-byte Folded Spill
70+
; CHECK-NEXT: bl __unordtf2
71+
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
72+
; CHECK-NEXT: cmp w0, #0
73+
; CHECK-NEXT: b.eq .LBB1_6
7374
; CHECK-NEXT: // %bb.5:
74-
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
75+
; CHECK-NEXT: adrp x8, .LCPI1_0
76+
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
7577
; CHECK-NEXT: .LBB1_6:
76-
; CHECK-NEXT: ldrb w8, [sp, #63]
77-
; CHECK-NEXT: cmp w8, #0
78-
; CHECK-NEXT: b.ne .LBB1_8
79-
; CHECK-NEXT: // %bb.7:
80-
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
81-
; CHECK-NEXT: .LBB1_8:
78+
; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
8279
; CHECK-NEXT: adrp x8, .LCPI1_1
83-
; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
84-
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
8580
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1]
86-
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
81+
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
8782
; CHECK-NEXT: bl __eqtf2
88-
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
83+
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
8984
; CHECK-NEXT: cmp w0, #0
90-
; CHECK-NEXT: b.ne .LBB1_10
91-
; CHECK-NEXT: // %bb.9:
92-
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
93-
; CHECK-NEXT: .LBB1_10:
85+
; CHECK-NEXT: b.ne .LBB1_8
86+
; CHECK-NEXT: // %bb.7:
87+
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
88+
; CHECK-NEXT: .LBB1_8:
9489
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
9590
; CHECK-NEXT: add sp, sp, #96
9691
; CHECK-NEXT: ret

llvm/test/CodeGen/ARM/fp-maximum-legalization.ll

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,40 +4,34 @@
44
define double @maximum_double(double %x, double %y) nounwind {
55
; CHECK-LABEL: maximum_double:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: sub sp, sp, #16
7+
; CHECK-NEXT: sub sp, sp, #8
88
; CHECK-NEXT: vmov d17, r2, r3
99
; CHECK-NEXT: mov r2, #0
1010
; CHECK-NEXT: vmov d16, r0, r1
1111
; CHECK-NEXT: mov r3, #0
1212
; CHECK-NEXT: vcmp.f64 d16, d17
1313
; CHECK-NEXT: mov r0, #0
1414
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
15-
; CHECK-NEXT: vstr d16, [sp, #8]
16-
; CHECK-NEXT: vstr d17, [sp]
17-
; CHECK-NEXT: ldrb r1, [sp, #15]
15+
; CHECK-NEXT: vldr d18, .LCPI0_0
1816
; CHECK-NEXT: vmov.f64 d19, d17
17+
; CHECK-NEXT: vstr d16, [sp]
18+
; CHECK-NEXT: ldrb r1, [sp, #7]
1919
; CHECK-NEXT: clz r1, r1
20-
; CHECK-NEXT: vldr d18, .LCPI0_0
2120
; CHECK-NEXT: movwvs r2, #1
2221
; CHECK-NEXT: movwgt r3, #1
2322
; CHECK-NEXT: cmp r3, #0
2423
; CHECK-NEXT: vmovne.f64 d19, d16
2524
; CHECK-NEXT: cmp r2, #0
26-
; CHECK-NEXT: ldrb r2, [sp, #7]
2725
; CHECK-NEXT: vmovne.f64 d19, d18
2826
; CHECK-NEXT: lsrs r1, r1, #5
29-
; CHECK-NEXT: clz r1, r2
3027
; CHECK-NEXT: vcmp.f64 d19, #0
31-
; CHECK-NEXT: vmov.f64 d18, d19
32-
; CHECK-NEXT: vmovne.f64 d18, d16
33-
; CHECK-NEXT: lsrs r1, r1, #5
34-
; CHECK-NEXT: vmovne.f64 d18, d17
28+
; CHECK-NEXT: vmovne.f64 d17, d16
3529
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
3630
; CHECK-NEXT: movweq r0, #1
3731
; CHECK-NEXT: cmp r0, #0
38-
; CHECK-NEXT: vmovne.f64 d19, d18
32+
; CHECK-NEXT: vmovne.f64 d19, d17
3933
; CHECK-NEXT: vmov r0, r1, d19
40-
; CHECK-NEXT: add sp, sp, #16
34+
; CHECK-NEXT: add sp, sp, #8
4135
; CHECK-NEXT: bx lr
4236
; CHECK-NEXT: .p2align 3
4337
; CHECK-NEXT: @ %bb.1:

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1351,30 +1351,28 @@ define bfloat @test_roundeven(bfloat %a) {
13511351
define bfloat @test_maximum(bfloat %a, bfloat %b) {
13521352
; SM70-LABEL: test_maximum(
13531353
; SM70: {
1354-
; SM70-NEXT: .reg .pred %p<6>;
1355-
; SM70-NEXT: .reg .b16 %rs<8>;
1354+
; SM70-NEXT: .reg .pred %p<5>;
1355+
; SM70-NEXT: .reg .b16 %rs<7>;
13561356
; SM70-NEXT: .reg .b32 %r<7>;
13571357
; SM70-EMPTY:
13581358
; SM70-NEXT: // %bb.0:
13591359
; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
1360+
; SM70-NEXT: setp.eq.s16 %p1, %rs1, 0;
13601361
; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
1362+
; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
13611363
; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
13621364
; SM70-NEXT: shl.b32 %r2, %r1, 16;
13631365
; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
13641366
; SM70-NEXT: shl.b32 %r4, %r3, 16;
1365-
; SM70-NEXT: setp.gt.f32 %p1, %r4, %r2;
1366-
; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
1367-
; SM70-NEXT: setp.nan.f32 %p2, %r4, %r2;
1368-
; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
1369-
; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0;
1370-
; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
1371-
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
1372-
; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
1373-
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
1367+
; SM70-NEXT: setp.gt.f32 %p2, %r4, %r2;
1368+
; SM70-NEXT: selp.b16 %rs4, %rs1, %rs2, %p2;
1369+
; SM70-NEXT: setp.nan.f32 %p3, %r4, %r2;
1370+
; SM70-NEXT: selp.b16 %rs5, 0x7FC0, %rs4, %p3;
1371+
; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
13741372
; SM70-NEXT: shl.b32 %r6, %r5, 16;
1375-
; SM70-NEXT: setp.eq.f32 %p5, %r6, 0f00000000;
1376-
; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
1377-
; SM70-NEXT: st.param.b16 [func_retval0], %rs7;
1373+
; SM70-NEXT: setp.eq.f32 %p4, %r6, 0f00000000;
1374+
; SM70-NEXT: selp.b16 %rs6, %rs3, %rs5, %p4;
1375+
; SM70-NEXT: st.param.b16 [func_retval0], %rs6;
13781376
; SM70-NEXT: ret;
13791377
;
13801378
; SM80-LABEL: test_maximum(
@@ -1475,48 +1473,44 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
14751473
define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
14761474
; SM70-LABEL: test_maximum_v2(
14771475
; SM70: {
1478-
; SM70-NEXT: .reg .pred %p<11>;
1479-
; SM70-NEXT: .reg .b16 %rs<19>;
1476+
; SM70-NEXT: .reg .pred %p<9>;
1477+
; SM70-NEXT: .reg .b16 %rs<15>;
14801478
; SM70-NEXT: .reg .b32 %r<16>;
14811479
; SM70-EMPTY:
14821480
; SM70-NEXT: // %bb.0:
14831481
; SM70-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
14841482
; SM70-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
14851483
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
1484+
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
1485+
; SM70-NEXT: setp.eq.s16 %p1, %rs4, 0;
1486+
; SM70-NEXT: selp.b16 %rs7, %rs4, %rs2, %p1;
14861487
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
14871488
; SM70-NEXT: shl.b32 %r4, %r3, 16;
1488-
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
14891489
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
14901490
; SM70-NEXT: shl.b32 %r6, %r5, 16;
1491-
; SM70-NEXT: setp.gt.f32 %p1, %r6, %r4;
1492-
; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
1493-
; SM70-NEXT: setp.nan.f32 %p2, %r6, %r4;
1494-
; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2;
1495-
; SM70-NEXT: setp.eq.s16 %p3, %rs4, 0;
1496-
; SM70-NEXT: selp.b16 %rs9, %rs4, %rs6, %p3;
1497-
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
1498-
; SM70-NEXT: selp.b16 %rs12, %rs2, %rs9, %p4;
1499-
; SM70-NEXT: cvt.u32.u16 %r7, %rs6;
1491+
; SM70-NEXT: setp.gt.f32 %p2, %r6, %r4;
1492+
; SM70-NEXT: selp.b16 %rs8, %rs4, %rs2, %p2;
1493+
; SM70-NEXT: setp.nan.f32 %p3, %r6, %r4;
1494+
; SM70-NEXT: selp.b16 %rs9, 0x7FC0, %rs8, %p3;
1495+
; SM70-NEXT: cvt.u32.u16 %r7, %rs9;
15001496
; SM70-NEXT: shl.b32 %r8, %r7, 16;
1501-
; SM70-NEXT: setp.eq.f32 %p5, %r8, 0f00000000;
1502-
; SM70-NEXT: selp.b16 %rs13, %rs12, %rs6, %p5;
1497+
; SM70-NEXT: setp.eq.f32 %p4, %r8, 0f00000000;
1498+
; SM70-NEXT: selp.b16 %rs10, %rs7, %rs9, %p4;
1499+
; SM70-NEXT: setp.eq.s16 %p5, %rs3, 0;
1500+
; SM70-NEXT: selp.b16 %rs11, %rs3, %rs1, %p5;
15031501
; SM70-NEXT: cvt.u32.u16 %r9, %rs1;
15041502
; SM70-NEXT: shl.b32 %r10, %r9, 16;
15051503
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
15061504
; SM70-NEXT: shl.b32 %r12, %r11, 16;
15071505
; SM70-NEXT: setp.gt.f32 %p6, %r12, %r10;
1508-
; SM70-NEXT: selp.b16 %rs14, %rs3, %rs1, %p6;
1506+
; SM70-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
15091507
; SM70-NEXT: setp.nan.f32 %p7, %r12, %r10;
1510-
; SM70-NEXT: selp.b16 %rs15, 0x7FC0, %rs14, %p7;
1511-
; SM70-NEXT: setp.eq.s16 %p8, %rs3, 0;
1512-
; SM70-NEXT: selp.b16 %rs16, %rs3, %rs15, %p8;
1513-
; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0;
1514-
; SM70-NEXT: selp.b16 %rs17, %rs1, %rs16, %p9;
1515-
; SM70-NEXT: cvt.u32.u16 %r13, %rs15;
1508+
; SM70-NEXT: selp.b16 %rs13, 0x7FC0, %rs12, %p7;
1509+
; SM70-NEXT: cvt.u32.u16 %r13, %rs13;
15161510
; SM70-NEXT: shl.b32 %r14, %r13, 16;
1517-
; SM70-NEXT: setp.eq.f32 %p10, %r14, 0f00000000;
1518-
; SM70-NEXT: selp.b16 %rs18, %rs17, %rs15, %p10;
1519-
; SM70-NEXT: mov.b32 %r15, {%rs18, %rs13};
1511+
; SM70-NEXT: setp.eq.f32 %p8, %r14, 0f00000000;
1512+
; SM70-NEXT: selp.b16 %rs14, %rs11, %rs13, %p8;
1513+
; SM70-NEXT: mov.b32 %r15, {%rs14, %rs10};
15201514
; SM70-NEXT: st.param.b32 [func_retval0], %r15;
15211515
; SM70-NEXT: ret;
15221516
;

0 commit comments

Comments
 (0)