Skip to content

Commit f6643e3

Browse files
committed
[DAG] Lower frem of power-2 using div/trunc/mul+sub.
If we are lowering a frem and the divisor is known to be an integer power of 2, we can use the formula 'frem = x - trunc(x / d) * d'. This avoids the more expensive call to fmod. The results are identical to those of fmod so long as d is a power of 2 (so the mul does not round incorrectly), and the sign of the result is either always positive (the dividend is an fabs) or not important (nsz). Unfortunately Alive2 does not handle this well at the moment. I was using exhaustive checking to test this; hopefully I didn't make a mistake in it (https://gist.github.com/davemgreen/6078015f30d3bacd1e9572f8db5d4b64). I found this pattern in CPython's implementation of float_pow. For now I have added it as a DAG combine for frem with power-of-2 fp constants.
1 parent 27b6f75 commit f6643e3

File tree

2 files changed

+158
-126
lines changed

2 files changed

+158
-126
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "llvm/ADT/APFloat.h"
1919
#include "llvm/ADT/APInt.h"
20+
#include "llvm/ADT/APSInt.h"
2021
#include "llvm/ADT/ArrayRef.h"
2122
#include "llvm/ADT/DenseMap.h"
2223
#include "llvm/ADT/IntervalMap.h"
@@ -17261,6 +17262,25 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
1726117262
if (SDValue NewSel = foldBinOpIntoSelect(N))
1726217263
return NewSel;
1726317264

17265+
if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(N1, true)) {
17266+
bool IsExact;
17267+
APSInt C1I(64, 0);
17268+
if (C1->getValueAPF().isInteger() && !C1->getValueAPF().isNegative() &&
17269+
C1->getValueAPF().convertToInteger(C1I, APFloat::rmTowardZero,
17270+
&IsExact) == APFloat::opOK &&
17271+
IsExact && isPowerOf2_64(C1I.getSExtValue()) &&
17272+
(Flags.hasNoSignedZeros() || N0.getOpcode() == ISD::FABS) &&
17273+
!TLI.isOperationLegal(ISD::FREM, VT) &&
17274+
TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
17275+
TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
17276+
TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT)) {
17277+
SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1);
17278+
SDValue Rnd = DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, Div);
17279+
SDValue Mul = DAG.getNode(ISD::FMUL, SDLoc(N), VT, Rnd, N1);
17280+
return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, Mul);
17281+
}
17282+
}
17283+
1726417284
return SDValue();
1726517285
}
1726617286

llvm/test/CodeGen/AArch64/frem-power2.ll

Lines changed: 138 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,40 @@ entry:
1313
}
1414

1515
define float @frem2_nsz(float %x) {
16-
; CHECK-LABEL: frem2_nsz:
17-
; CHECK: // %bb.0: // %entry
18-
; CHECK-NEXT: fmov s1, #2.00000000
19-
; CHECK-NEXT: b fmodf
16+
; CHECK-SD-LABEL: frem2_nsz:
17+
; CHECK-SD: // %bb.0: // %entry
18+
; CHECK-SD-NEXT: fmov s1, #2.00000000
19+
; CHECK-SD-NEXT: fdiv s1, s0, s1
20+
; CHECK-SD-NEXT: frintz s1, s1
21+
; CHECK-SD-NEXT: fadd s1, s1, s1
22+
; CHECK-SD-NEXT: fsub s0, s0, s1
23+
; CHECK-SD-NEXT: ret
24+
;
25+
; CHECK-GI-LABEL: frem2_nsz:
26+
; CHECK-GI: // %bb.0: // %entry
27+
; CHECK-GI-NEXT: fmov s1, #2.00000000
28+
; CHECK-GI-NEXT: b fmodf
2029
entry:
2130
%fmod = frem nsz float %x, 2.0
2231
ret float %fmod
2332
}
2433

2534
define float @frem2_abs(float %x) {
26-
; CHECK-LABEL: frem2_abs:
27-
; CHECK: // %bb.0: // %entry
28-
; CHECK-NEXT: fabs s0, s0
29-
; CHECK-NEXT: fmov s1, #2.00000000
30-
; CHECK-NEXT: b fmodf
35+
; CHECK-SD-LABEL: frem2_abs:
36+
; CHECK-SD: // %bb.0: // %entry
37+
; CHECK-SD-NEXT: fabs s0, s0
38+
; CHECK-SD-NEXT: fmov s1, #2.00000000
39+
; CHECK-SD-NEXT: fdiv s1, s0, s1
40+
; CHECK-SD-NEXT: frintz s1, s1
41+
; CHECK-SD-NEXT: fadd s1, s1, s1
42+
; CHECK-SD-NEXT: fsub s0, s0, s1
43+
; CHECK-SD-NEXT: ret
44+
;
45+
; CHECK-GI-LABEL: frem2_abs:
46+
; CHECK-GI: // %bb.0: // %entry
47+
; CHECK-GI-NEXT: fabs s0, s0
48+
; CHECK-GI-NEXT: fmov s1, #2.00000000
49+
; CHECK-GI-NEXT: b fmodf
3150
entry:
3251
%a = tail call float @llvm.fabs.f32(float %x)
3352
%fmod = frem float %a, 2.0
@@ -37,14 +56,11 @@ entry:
3756
define half @hrem2_nsz(half %x) {
3857
; CHECK-SD-LABEL: hrem2_nsz:
3958
; CHECK-SD: // %bb.0: // %entry
40-
; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
41-
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
42-
; CHECK-SD-NEXT: .cfi_offset w30, -16
43-
; CHECK-SD-NEXT: fcvt s0, h0
44-
; CHECK-SD-NEXT: fmov s1, #2.00000000
45-
; CHECK-SD-NEXT: bl fmodf
46-
; CHECK-SD-NEXT: fcvt h0, s0
47-
; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
59+
; CHECK-SD-NEXT: fmov h1, #2.00000000
60+
; CHECK-SD-NEXT: fdiv h1, h0, h1
61+
; CHECK-SD-NEXT: frintz h1, h1
62+
; CHECK-SD-NEXT: fadd h1, h1, h1
63+
; CHECK-SD-NEXT: fsub h0, h0, h1
4864
; CHECK-SD-NEXT: ret
4965
;
5066
; CHECK-GI-LABEL: hrem2_nsz:
@@ -65,10 +81,19 @@ entry:
6581
}
6682

6783
define double @drem2_nsz(double %x) {
68-
; CHECK-LABEL: drem2_nsz:
69-
; CHECK: // %bb.0: // %entry
70-
; CHECK-NEXT: fmov d1, #2.00000000
71-
; CHECK-NEXT: b fmod
84+
; CHECK-SD-LABEL: drem2_nsz:
85+
; CHECK-SD: // %bb.0: // %entry
86+
; CHECK-SD-NEXT: fmov d1, #2.00000000
87+
; CHECK-SD-NEXT: fdiv d1, d0, d1
88+
; CHECK-SD-NEXT: frintz d1, d1
89+
; CHECK-SD-NEXT: fadd d1, d1, d1
90+
; CHECK-SD-NEXT: fsub d0, d0, d1
91+
; CHECK-SD-NEXT: ret
92+
;
93+
; CHECK-GI-LABEL: drem2_nsz:
94+
; CHECK-GI: // %bb.0: // %entry
95+
; CHECK-GI-NEXT: fmov d1, #2.00000000
96+
; CHECK-GI-NEXT: b fmod
7297
entry:
7398
%fmod = frem nsz double %x, 2.0
7499
ret double %fmod
@@ -105,62 +130,115 @@ entry:
105130
}
106131

107132
define float @frem4_abs(float %x) {
108-
; CHECK-LABEL: frem4_abs:
109-
; CHECK: // %bb.0: // %entry
110-
; CHECK-NEXT: fabs s0, s0
111-
; CHECK-NEXT: fmov s1, #4.00000000
112-
; CHECK-NEXT: b fmodf
133+
; CHECK-SD-LABEL: frem4_abs:
134+
; CHECK-SD: // %bb.0: // %entry
135+
; CHECK-SD-NEXT: fabs s0, s0
136+
; CHECK-SD-NEXT: fmov s1, #4.00000000
137+
; CHECK-SD-NEXT: fdiv s2, s0, s1
138+
; CHECK-SD-NEXT: frintz s2, s2
139+
; CHECK-SD-NEXT: fmul s1, s2, s1
140+
; CHECK-SD-NEXT: fsub s0, s0, s1
141+
; CHECK-SD-NEXT: ret
142+
;
143+
; CHECK-GI-LABEL: frem4_abs:
144+
; CHECK-GI: // %bb.0: // %entry
145+
; CHECK-GI-NEXT: fabs s0, s0
146+
; CHECK-GI-NEXT: fmov s1, #4.00000000
147+
; CHECK-GI-NEXT: b fmodf
113148
entry:
114149
%a = tail call float @llvm.fabs.f32(float %x)
115150
%fmod = frem float %a, 4.0
116151
ret float %fmod
117152
}
118153

119154
define float @frem16_abs(float %x) {
120-
; CHECK-LABEL: frem16_abs:
121-
; CHECK: // %bb.0: // %entry
122-
; CHECK-NEXT: fabs s0, s0
123-
; CHECK-NEXT: fmov s1, #16.00000000
124-
; CHECK-NEXT: b fmodf
155+
; CHECK-SD-LABEL: frem16_abs:
156+
; CHECK-SD: // %bb.0: // %entry
157+
; CHECK-SD-NEXT: fabs s0, s0
158+
; CHECK-SD-NEXT: fmov s1, #16.00000000
159+
; CHECK-SD-NEXT: fdiv s2, s0, s1
160+
; CHECK-SD-NEXT: frintz s2, s2
161+
; CHECK-SD-NEXT: fmul s1, s2, s1
162+
; CHECK-SD-NEXT: fsub s0, s0, s1
163+
; CHECK-SD-NEXT: ret
164+
;
165+
; CHECK-GI-LABEL: frem16_abs:
166+
; CHECK-GI: // %bb.0: // %entry
167+
; CHECK-GI-NEXT: fabs s0, s0
168+
; CHECK-GI-NEXT: fmov s1, #16.00000000
169+
; CHECK-GI-NEXT: b fmodf
125170
entry:
126171
%a = tail call float @llvm.fabs.f32(float %x)
127172
%fmod = frem float %a, 16.0
128173
ret float %fmod
129174
}
130175

131176
define float @frem4294967296_abs(float %x) {
132-
; CHECK-LABEL: frem4294967296_abs:
133-
; CHECK: // %bb.0: // %entry
134-
; CHECK-NEXT: fabs s0, s0
135-
; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
136-
; CHECK-NEXT: fmov s1, w8
137-
; CHECK-NEXT: b fmodf
177+
; CHECK-SD-LABEL: frem4294967296_abs:
178+
; CHECK-SD: // %bb.0: // %entry
179+
; CHECK-SD-NEXT: fabs s0, s0
180+
; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
181+
; CHECK-SD-NEXT: fmov s1, w8
182+
; CHECK-SD-NEXT: fdiv s2, s0, s1
183+
; CHECK-SD-NEXT: frintz s2, s2
184+
; CHECK-SD-NEXT: fmul s1, s2, s1
185+
; CHECK-SD-NEXT: fsub s0, s0, s1
186+
; CHECK-SD-NEXT: ret
187+
;
188+
; CHECK-GI-LABEL: frem4294967296_abs:
189+
; CHECK-GI: // %bb.0: // %entry
190+
; CHECK-GI-NEXT: fabs s0, s0
191+
; CHECK-GI-NEXT: mov w8, #1333788672 // =0x4f800000
192+
; CHECK-GI-NEXT: fmov s1, w8
193+
; CHECK-GI-NEXT: b fmodf
138194
entry:
139195
%a = tail call float @llvm.fabs.f32(float %x)
140196
%fmod = frem float %a, 4294967296.0
141197
ret float %fmod
142198
}
143199

144200
define float @frem1152921504606846976_abs(float %x) {
145-
; CHECK-LABEL: frem1152921504606846976_abs:
146-
; CHECK: // %bb.0: // %entry
147-
; CHECK-NEXT: fabs s0, s0
148-
; CHECK-NEXT: mov w8, #1568669696 // =0x5d800000
149-
; CHECK-NEXT: fmov s1, w8
150-
; CHECK-NEXT: b fmodf
201+
; CHECK-SD-LABEL: frem1152921504606846976_abs:
202+
; CHECK-SD: // %bb.0: // %entry
203+
; CHECK-SD-NEXT: fabs s0, s0
204+
; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
205+
; CHECK-SD-NEXT: fmov s1, w8
206+
; CHECK-SD-NEXT: fdiv s2, s0, s1
207+
; CHECK-SD-NEXT: frintz s2, s2
208+
; CHECK-SD-NEXT: fmul s1, s2, s1
209+
; CHECK-SD-NEXT: fsub s0, s0, s1
210+
; CHECK-SD-NEXT: ret
211+
;
212+
; CHECK-GI-LABEL: frem1152921504606846976_abs:
213+
; CHECK-GI: // %bb.0: // %entry
214+
; CHECK-GI-NEXT: fabs s0, s0
215+
; CHECK-GI-NEXT: mov w8, #1568669696 // =0x5d800000
216+
; CHECK-GI-NEXT: fmov s1, w8
217+
; CHECK-GI-NEXT: b fmodf
151218
entry:
152219
%a = tail call float @llvm.fabs.f32(float %x)
153220
%fmod = frem float %a, 1152921504606846976.0
154221
ret float %fmod
155222
}
156223

157224
define float @frem4611686018427387904_abs(float %x) {
158-
; CHECK-LABEL: frem4611686018427387904_abs:
159-
; CHECK: // %bb.0: // %entry
160-
; CHECK-NEXT: fabs s0, s0
161-
; CHECK-NEXT: mov w8, #1585446912 // =0x5e800000
162-
; CHECK-NEXT: fmov s1, w8
163-
; CHECK-NEXT: b fmodf
225+
; CHECK-SD-LABEL: frem4611686018427387904_abs:
226+
; CHECK-SD: // %bb.0: // %entry
227+
; CHECK-SD-NEXT: fabs s0, s0
228+
; CHECK-SD-NEXT: mov w8, #1585446912 // =0x5e800000
229+
; CHECK-SD-NEXT: fmov s1, w8
230+
; CHECK-SD-NEXT: fdiv s2, s0, s1
231+
; CHECK-SD-NEXT: frintz s2, s2
232+
; CHECK-SD-NEXT: fmul s1, s2, s1
233+
; CHECK-SD-NEXT: fsub s0, s0, s1
234+
; CHECK-SD-NEXT: ret
235+
;
236+
; CHECK-GI-LABEL: frem4611686018427387904_abs:
237+
; CHECK-GI: // %bb.0: // %entry
238+
; CHECK-GI-NEXT: fabs s0, s0
239+
; CHECK-GI-NEXT: mov w8, #1585446912 // =0x5e800000
240+
; CHECK-GI-NEXT: fmov s1, w8
241+
; CHECK-GI-NEXT: b fmodf
164242
entry:
165243
%a = tail call float @llvm.fabs.f32(float %x)
166244
%fmod = frem float %a, 4611686018427387904.0
@@ -182,42 +260,11 @@ entry:
182260
define <4 x float> @frem2_nsz_vec(<4 x float> %x) {
183261
; CHECK-SD-LABEL: frem2_nsz_vec:
184262
; CHECK-SD: // %bb.0: // %entry
185-
; CHECK-SD-NEXT: sub sp, sp, #48
186-
; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
187-
; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
188-
; CHECK-SD-NEXT: .cfi_offset w30, -16
189-
; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
190-
; CHECK-SD-NEXT: mov s0, v0.s[1]
191-
; CHECK-SD-NEXT: fmov s1, #2.00000000
192-
; CHECK-SD-NEXT: bl fmodf
193-
; CHECK-SD-NEXT: fmov s1, #2.00000000
194-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
195-
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
196-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
197-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
198-
; CHECK-SD-NEXT: bl fmodf
199-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
200-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
201-
; CHECK-SD-NEXT: mov v0.s[1], v1.s[0]
202-
; CHECK-SD-NEXT: fmov s1, #2.00000000
203-
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
204-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
205-
; CHECK-SD-NEXT: mov s0, v0.s[2]
206-
; CHECK-SD-NEXT: bl fmodf
207-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
208-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
209-
; CHECK-SD-NEXT: mov v1.s[2], v0.s[0]
210-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
211-
; CHECK-SD-NEXT: mov s0, v0.s[3]
212-
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
213-
; CHECK-SD-NEXT: fmov s1, #2.00000000
214-
; CHECK-SD-NEXT: bl fmodf
215-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
216-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
217-
; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
218-
; CHECK-SD-NEXT: mov v1.s[3], v0.s[0]
219-
; CHECK-SD-NEXT: mov v0.16b, v1.16b
220-
; CHECK-SD-NEXT: add sp, sp, #48
263+
; CHECK-SD-NEXT: movi v1.4s, #64, lsl #24
264+
; CHECK-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s
265+
; CHECK-SD-NEXT: frintz v1.4s, v1.4s
266+
; CHECK-SD-NEXT: fadd v1.4s, v1.4s, v1.4s
267+
; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
221268
; CHECK-SD-NEXT: ret
222269
;
223270
; CHECK-GI-LABEL: frem2_nsz_vec:
@@ -272,48 +319,13 @@ entry:
272319
define <4 x float> @frem1152921504606846976_absv(<4 x float> %x) {
273320
; CHECK-SD-LABEL: frem1152921504606846976_absv:
274321
; CHECK-SD: // %bb.0: // %entry
275-
; CHECK-SD-NEXT: sub sp, sp, #48
276-
; CHECK-SD-NEXT: str d8, [sp, #32] // 8-byte Folded Spill
277-
; CHECK-SD-NEXT: str x30, [sp, #40] // 8-byte Folded Spill
278-
; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
279-
; CHECK-SD-NEXT: .cfi_offset w30, -8
280-
; CHECK-SD-NEXT: .cfi_offset b8, -16
281-
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
282322
; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
283-
; CHECK-SD-NEXT: fmov s8, w8
284-
; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
285-
; CHECK-SD-NEXT: mov s0, v0.s[1]
286-
; CHECK-SD-NEXT: fmov s1, s8
287-
; CHECK-SD-NEXT: bl fmodf
288-
; CHECK-SD-NEXT: fmov s1, s8
289-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
290-
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
291-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
292-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
293-
; CHECK-SD-NEXT: bl fmodf
294-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
295-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
296-
; CHECK-SD-NEXT: mov v0.s[1], v1.s[0]
297-
; CHECK-SD-NEXT: fmov s1, s8
298-
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
299-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
300-
; CHECK-SD-NEXT: mov s0, v0.s[2]
301-
; CHECK-SD-NEXT: bl fmodf
302-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
303-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
304-
; CHECK-SD-NEXT: mov v1.s[2], v0.s[0]
305-
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
306-
; CHECK-SD-NEXT: mov s0, v0.s[3]
307-
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
308-
; CHECK-SD-NEXT: fmov s1, s8
309-
; CHECK-SD-NEXT: bl fmodf
310-
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
311-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
312-
; CHECK-SD-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
313-
; CHECK-SD-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload
314-
; CHECK-SD-NEXT: mov v1.s[3], v0.s[0]
315-
; CHECK-SD-NEXT: mov v0.16b, v1.16b
316-
; CHECK-SD-NEXT: add sp, sp, #48
323+
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
324+
; CHECK-SD-NEXT: dup v1.4s, w8
325+
; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
326+
; CHECK-SD-NEXT: frintz v2.4s, v2.4s
327+
; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v1.4s
328+
; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
317329
; CHECK-SD-NEXT: ret
318330
;
319331
; CHECK-GI-LABEL: frem1152921504606846976_absv:

0 commit comments

Comments
 (0)