Skip to content

Commit 4c98f5b

Browse files
authored
[DAG] Use copysign in frem power-2 fold. (#91751)
As a small addition to #91148, this uses copysign to produce the correct sign for zero when converting frem to div/trunc/mul in cases where we do not know that the input is positive (and we care about sign bits, i.e. nsz is not set). Copying the sign of the dividend onto the result gives a correctly signed zero. In testing, the only case in which this produced a different result from fmod was: frem -inf, 4.0 -> nan vs -nan.
1 parent ba8a2ad commit 4c98f5b

File tree

3 files changed

+117
-18
lines changed

3 files changed

+117
-18
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17386,15 +17386,20 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
1738617386
TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
1738717387
TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
1738817388
TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
17389-
DAG.isKnownToBeAPowerOfTwoFP(N1) &&
17390-
(Flags.hasNoSignedZeros() || DAG.cannotBeOrderedNegativeFP(N0))) {
17389+
DAG.isKnownToBeAPowerOfTwoFP(N1)) {
17390+
bool NeedsCopySign =
17391+
!Flags.hasNoSignedZeros() && !DAG.cannotBeOrderedNegativeFP(N0);
1739117392
SDValue Div = DAG.getNode(ISD::FDIV, DL, VT, N0, N1);
1739217393
SDValue Rnd = DAG.getNode(ISD::FTRUNC, DL, VT, Div);
17393-
if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT))
17394-
return DAG.getNode(ISD::FMA, DL, VT, DAG.getNode(ISD::FNEG, DL, VT, Rnd),
17395-
N1, N0);
17396-
SDValue Mul = DAG.getNode(ISD::FMUL, DL, VT, Rnd, N1);
17397-
return DAG.getNode(ISD::FSUB, DL, VT, N0, Mul);
17394+
SDValue MLA;
17395+
if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
17396+
MLA = DAG.getNode(ISD::FMA, DL, VT, DAG.getNode(ISD::FNEG, DL, VT, Rnd),
17397+
N1, N0);
17398+
} else {
17399+
SDValue Mul = DAG.getNode(ISD::FMUL, DL, VT, Rnd, N1);
17400+
MLA = DAG.getNode(ISD::FSUB, DL, VT, N0, Mul);
17401+
}
17402+
return NeedsCopySign ? DAG.getNode(ISD::FCOPYSIGN, DL, VT, MLA, N0) : MLA;
1739817403
}
1739917404

1740017405
return SDValue();

llvm/test/CodeGen/AArch64/frem-power2.ll

Lines changed: 85 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,22 @@
33
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
define float @frem2(float %x) {
6-
; CHECK-LABEL: frem2:
7-
; CHECK: // %bb.0: // %entry
8-
; CHECK-NEXT: fmov s1, #2.00000000
9-
; CHECK-NEXT: b fmodf
6+
; CHECK-SD-LABEL: frem2:
7+
; CHECK-SD: // %bb.0: // %entry
8+
; CHECK-SD-NEXT: fmov s1, #2.00000000
9+
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
10+
; CHECK-SD-NEXT: fdiv s2, s0, s1
11+
; CHECK-SD-NEXT: frintz s2, s2
12+
; CHECK-SD-NEXT: fmsub s1, s2, s1, s0
13+
; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24
14+
; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b
15+
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
16+
; CHECK-SD-NEXT: ret
17+
;
18+
; CHECK-GI-LABEL: frem2:
19+
; CHECK-GI: // %bb.0: // %entry
20+
; CHECK-GI-NEXT: fmov s1, #2.00000000
21+
; CHECK-GI-NEXT: b fmodf
1022
entry:
1123
%fmod = frem float %x, 2.0
1224
ret float %fmod
@@ -311,6 +323,67 @@ entry:
311323
ret float %fmod
312324
}
313325

326+
define <4 x float> @frem2_vec(<4 x float> %x) {
327+
; CHECK-SD-LABEL: frem2_vec:
328+
; CHECK-SD: // %bb.0: // %entry
329+
; CHECK-SD-NEXT: movi v1.4s, #64, lsl #24
330+
; CHECK-SD-NEXT: mov v3.16b, v0.16b
331+
; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
332+
; CHECK-SD-NEXT: frintz v2.4s, v2.4s
333+
; CHECK-SD-NEXT: fmls v3.4s, v1.4s, v2.4s
334+
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
335+
; CHECK-SD-NEXT: bit v0.16b, v3.16b, v1.16b
336+
; CHECK-SD-NEXT: ret
337+
;
338+
; CHECK-GI-LABEL: frem2_vec:
339+
; CHECK-GI: // %bb.0: // %entry
340+
; CHECK-GI-NEXT: sub sp, sp, #80
341+
; CHECK-GI-NEXT: str d10, [sp, #48] // 8-byte Folded Spill
342+
; CHECK-GI-NEXT: stp d9, d8, [sp, #56] // 16-byte Folded Spill
343+
; CHECK-GI-NEXT: str x30, [sp, #72] // 8-byte Folded Spill
344+
; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
345+
; CHECK-GI-NEXT: .cfi_offset w30, -8
346+
; CHECK-GI-NEXT: .cfi_offset b8, -16
347+
; CHECK-GI-NEXT: .cfi_offset b9, -24
348+
; CHECK-GI-NEXT: .cfi_offset b10, -32
349+
; CHECK-GI-NEXT: fmov s1, #2.00000000
350+
; CHECK-GI-NEXT: mov s8, v0.s[1]
351+
; CHECK-GI-NEXT: mov s9, v0.s[2]
352+
; CHECK-GI-NEXT: mov s10, v0.s[3]
353+
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
354+
; CHECK-GI-NEXT: bl fmodf
355+
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
356+
; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
357+
; CHECK-GI-NEXT: fmov s1, #2.00000000
358+
; CHECK-GI-NEXT: fmov s0, s8
359+
; CHECK-GI-NEXT: bl fmodf
360+
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
361+
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
362+
; CHECK-GI-NEXT: fmov s1, #2.00000000
363+
; CHECK-GI-NEXT: fmov s0, s9
364+
; CHECK-GI-NEXT: bl fmodf
365+
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
366+
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
367+
; CHECK-GI-NEXT: fmov s1, #2.00000000
368+
; CHECK-GI-NEXT: fmov s0, s10
369+
; CHECK-GI-NEXT: bl fmodf
370+
; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
371+
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
372+
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
373+
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
374+
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
375+
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
376+
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
377+
; CHECK-GI-NEXT: mov v1.s[2], v2.s[0]
378+
; CHECK-GI-NEXT: mov v1.s[3], v0.s[0]
379+
; CHECK-GI-NEXT: mov v0.16b, v1.16b
380+
; CHECK-GI-NEXT: add sp, sp, #80
381+
; CHECK-GI-NEXT: ret
382+
entry:
383+
%fmod = frem <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
384+
ret <4 x float> %fmod
385+
}
386+
314387
define <4 x float> @frem2_nsz_vec(<4 x float> %x) {
315388
; CHECK-SD-LABEL: frem2_nsz_vec:
316389
; CHECK-SD: // %bb.0: // %entry
@@ -514,10 +587,15 @@ define float @frem2_constneg_sitofp(float %x, i32 %sa) {
514587
; CHECK-SD-LABEL: frem2_constneg_sitofp:
515588
; CHECK-SD: // %bb.0: // %entry
516589
; CHECK-SD-NEXT: mov w8, #1 // =0x1
517-
; CHECK-SD-NEXT: fmov s0, #-12.50000000
590+
; CHECK-SD-NEXT: fmov s1, #-12.50000000
518591
; CHECK-SD-NEXT: lsl w8, w8, w0
519-
; CHECK-SD-NEXT: scvtf s1, w8
520-
; CHECK-SD-NEXT: b fmodf
592+
; CHECK-SD-NEXT: scvtf s0, w8
593+
; CHECK-SD-NEXT: fdiv s2, s1, s0
594+
; CHECK-SD-NEXT: frintz s2, s2
595+
; CHECK-SD-NEXT: fmsub s0, s2, s0, s1
596+
; CHECK-SD-NEXT: fabs s0, s0
597+
; CHECK-SD-NEXT: fneg s0, s0
598+
; CHECK-SD-NEXT: ret
521599
;
522600
; CHECK-GI-LABEL: frem2_constneg_sitofp:
523601
; CHECK-GI: // %bb.0: // %entry

llvm/test/CodeGen/ARM/frem-power2.ll

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,29 @@ define float @frem4(float %x) {
1414
;
1515
; CHECK-FP-LABEL: frem4:
1616
; CHECK-FP: @ %bb.0: @ %entry
17-
; CHECK-FP-NEXT: mov.w r1, #1082130432
18-
; CHECK-FP-NEXT: b fmodf
17+
; CHECK-FP-NEXT: vmov.f32 s0, #4.000000e+00
18+
; CHECK-FP-NEXT: vmov s2, r0
19+
; CHECK-FP-NEXT: lsrs r0, r0, #31
20+
; CHECK-FP-NEXT: vdiv.f32 s4, s2, s0
21+
; CHECK-FP-NEXT: vrintz.f32 s4, s4
22+
; CHECK-FP-NEXT: vfms.f32 s2, s4, s0
23+
; CHECK-FP-NEXT: vmov r1, s2
24+
; CHECK-FP-NEXT: bfi r1, r0, #31, #1
25+
; CHECK-FP-NEXT: mov r0, r1
26+
; CHECK-FP-NEXT: bx lr
1927
;
2028
; CHECK-M33-LABEL: frem4:
2129
; CHECK-M33: @ %bb.0: @ %entry
22-
; CHECK-M33-NEXT: mov.w r1, #1082130432
23-
; CHECK-M33-NEXT: b fmodf
30+
; CHECK-M33-NEXT: vmov.f32 s0, #4.000000e+00
31+
; CHECK-M33-NEXT: vmov s2, r0
32+
; CHECK-M33-NEXT: lsrs r0, r0, #31
33+
; CHECK-M33-NEXT: vdiv.f32 s4, s2, s0
34+
; CHECK-M33-NEXT: vrintz.f32 s4, s4
35+
; CHECK-M33-NEXT: vmls.f32 s2, s4, s0
36+
; CHECK-M33-NEXT: vmov r1, s2
37+
; CHECK-M33-NEXT: bfi r1, r0, #31, #1
38+
; CHECK-M33-NEXT: mov r0, r1
39+
; CHECK-M33-NEXT: bx lr
2440
entry:
2541
%fmod = frem float %x, 4.0
2642
ret float %fmod

0 commit comments

Comments
 (0)