Skip to content

Commit 576be7b

Browse files
authored
[AArch64][SVE] Generate asrd instruction for positive pow-2 divisors … (llvm#137151)
…when SVE is available

Currently,
```
sdiv(x, y) --> cmlt + usra + sshr       , where y is positive pow-2 integer
sdiv(x, y) --> cmlt + usra + sshr + neg , where y is negative pow-2 integer
```

This patch aims to transform this into
```
sdiv(x, y) --> ptrue + asrd        , where y is positive pow-2 integer
sdiv(x, y) --> ptrue + asrd + subr , where y is negative pow-2 integer
```
1 parent a28f657 commit 576be7b

File tree

2 files changed

+46
-70
lines changed

2 files changed

+46
-70
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18421,10 +18421,10 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
1842118421

1842218422
EVT VT = N->getValueType(0);
1842318423

18424-
// For scalable and fixed types, mark them as cheap so we can handle it much
18425-
// later. This allows us to handle larger than legal types.
18426-
if (VT.isScalableVector() ||
18427-
(VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18424+
// If SVE is available, we can generate
18425+
// sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
18426+
// sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
18427+
if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
1842818428
return SDValue(N, 0);
1842918429

1843018430
// fold (sdiv X, pow2)

llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll

Lines changed: 42 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ target triple = "aarch64-unknown-linux-gnu"
99
define <4 x i32> @sdiv_v4i32_negative_pow2_divisor_packed(<4 x i32> %op1) vscale_range(1,0) #0 {
1010
; CHECK-LABEL: sdiv_v4i32_negative_pow2_divisor_packed:
1111
; CHECK: // %bb.0:
12-
; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
13-
; CHECK-NEXT: usra v0.4s, v1.4s, #29
14-
; CHECK-NEXT: sshr v0.4s, v0.4s, #3
15-
; CHECK-NEXT: neg v0.4s, v0.4s
12+
; CHECK-NEXT: ptrue p0.s, vl4
13+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
14+
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
15+
; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0
16+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1617
; CHECK-NEXT: ret
1718
%res = sdiv <4 x i32> %op1, splat (i32 -8)
1819
ret <4 x i32> %res
@@ -21,10 +22,11 @@ define <4 x i32> @sdiv_v4i32_negative_pow2_divisor_packed(<4 x i32> %op1) vscale
2122
define <2 x i32> @sdiv_v2i32_negative_pow2_divisor_unpacked(<2 x i32> %op1) vscale_range(1,0) #0 {
2223
; CHECK-LABEL: sdiv_v2i32_negative_pow2_divisor_unpacked:
2324
; CHECK: // %bb.0:
24-
; CHECK-NEXT: cmlt v1.2s, v0.2s, #0
25-
; CHECK-NEXT: usra v0.2s, v1.2s, #29
26-
; CHECK-NEXT: sshr v0.2s, v0.2s, #3
27-
; CHECK-NEXT: neg v0.2s, v0.2s
25+
; CHECK-NEXT: ptrue p0.s, vl2
26+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
27+
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
28+
; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0
29+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
2830
; CHECK-NEXT: ret
2931
%res = sdiv <2 x i32> %op1, splat (i32 -8)
3032
ret <2 x i32> %res
@@ -33,9 +35,10 @@ define <2 x i32> @sdiv_v2i32_negative_pow2_divisor_unpacked(<2 x i32> %op1) vsca
3335
define <4 x i32> @sdiv_v4i32_positive_pow2_divisor_packed(<4 x i32> %op1) vscale_range(1,0) #0 {
3436
; CHECK-LABEL: sdiv_v4i32_positive_pow2_divisor_packed:
3537
; CHECK: // %bb.0:
36-
; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
37-
; CHECK-NEXT: usra v0.4s, v1.4s, #29
38-
; CHECK-NEXT: sshr v0.4s, v0.4s, #3
38+
; CHECK-NEXT: ptrue p0.s, vl4
39+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
40+
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
41+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
3942
; CHECK-NEXT: ret
4043
%res = sdiv <4 x i32> %op1, splat (i32 8)
4144
ret <4 x i32> %res
@@ -44,9 +47,10 @@ define <4 x i32> @sdiv_v4i32_positive_pow2_divisor_packed(<4 x i32> %op1) vscale
4447
define <2 x i32> @sdiv_v2i32_positive_pow2_divisor_unpacked(<2 x i32> %op1) vscale_range(1,0) #0 {
4548
; CHECK-LABEL: sdiv_v2i32_positive_pow2_divisor_unpacked:
4649
; CHECK: // %bb.0:
47-
; CHECK-NEXT: cmlt v1.2s, v0.2s, #0
48-
; CHECK-NEXT: usra v0.2s, v1.2s, #29
49-
; CHECK-NEXT: sshr v0.2s, v0.2s, #3
50+
; CHECK-NEXT: ptrue p0.s, vl2
51+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
52+
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
53+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
5054
; CHECK-NEXT: ret
5155
%res = sdiv <2 x i32> %op1, splat (i32 8)
5256
ret <2 x i32> %res
@@ -95,19 +99,12 @@ define void @sdiv_v64i8(ptr %a) #0 {
9599
; VBITS_GE_128-LABEL: sdiv_v64i8:
96100
; VBITS_GE_128: // %bb.0:
97101
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
98-
; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
99-
; VBITS_GE_128-NEXT: cmlt v2.16b, v0.16b, #0
100-
; VBITS_GE_128-NEXT: cmlt v5.16b, v1.16b, #0
101-
; VBITS_GE_128-NEXT: cmlt v6.16b, v3.16b, #0
102-
; VBITS_GE_128-NEXT: usra v0.16b, v2.16b, #3
103-
; VBITS_GE_128-NEXT: cmlt v2.16b, v4.16b, #0
104-
; VBITS_GE_128-NEXT: usra v1.16b, v5.16b, #3
105-
; VBITS_GE_128-NEXT: usra v3.16b, v6.16b, #3
106-
; VBITS_GE_128-NEXT: usra v4.16b, v2.16b, #3
107-
; VBITS_GE_128-NEXT: sshr v0.16b, v0.16b, #5
108-
; VBITS_GE_128-NEXT: sshr v1.16b, v1.16b, #5
109-
; VBITS_GE_128-NEXT: sshr v2.16b, v3.16b, #5
110-
; VBITS_GE_128-NEXT: sshr v3.16b, v4.16b, #5
102+
; VBITS_GE_128-NEXT: ptrue p0.b, vl16
103+
; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
104+
; VBITS_GE_128-NEXT: asrd z0.b, p0/m, z0.b, #5
105+
; VBITS_GE_128-NEXT: asrd z1.b, p0/m, z1.b, #5
106+
; VBITS_GE_128-NEXT: asrd z2.b, p0/m, z2.b, #5
107+
; VBITS_GE_128-NEXT: asrd z3.b, p0/m, z3.b, #5
111108
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
112109
; VBITS_GE_128-NEXT: stp q2, q3, [x0]
113110
; VBITS_GE_128-NEXT: ret
@@ -209,19 +206,12 @@ define void @sdiv_v32i16(ptr %a) #0 {
209206
; VBITS_GE_128-LABEL: sdiv_v32i16:
210207
; VBITS_GE_128: // %bb.0:
211208
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
212-
; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
213-
; VBITS_GE_128-NEXT: cmlt v2.8h, v0.8h, #0
214-
; VBITS_GE_128-NEXT: cmlt v5.8h, v1.8h, #0
215-
; VBITS_GE_128-NEXT: cmlt v6.8h, v3.8h, #0
216-
; VBITS_GE_128-NEXT: usra v0.8h, v2.8h, #11
217-
; VBITS_GE_128-NEXT: cmlt v2.8h, v4.8h, #0
218-
; VBITS_GE_128-NEXT: usra v1.8h, v5.8h, #11
219-
; VBITS_GE_128-NEXT: usra v3.8h, v6.8h, #11
220-
; VBITS_GE_128-NEXT: usra v4.8h, v2.8h, #11
221-
; VBITS_GE_128-NEXT: sshr v0.8h, v0.8h, #5
222-
; VBITS_GE_128-NEXT: sshr v1.8h, v1.8h, #5
223-
; VBITS_GE_128-NEXT: sshr v2.8h, v3.8h, #5
224-
; VBITS_GE_128-NEXT: sshr v3.8h, v4.8h, #5
209+
; VBITS_GE_128-NEXT: ptrue p0.h, vl8
210+
; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
211+
; VBITS_GE_128-NEXT: asrd z0.h, p0/m, z0.h, #5
212+
; VBITS_GE_128-NEXT: asrd z1.h, p0/m, z1.h, #5
213+
; VBITS_GE_128-NEXT: asrd z2.h, p0/m, z2.h, #5
214+
; VBITS_GE_128-NEXT: asrd z3.h, p0/m, z3.h, #5
225215
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
226216
; VBITS_GE_128-NEXT: stp q2, q3, [x0]
227217
; VBITS_GE_128-NEXT: ret
@@ -324,19 +314,12 @@ define void @sdiv_v16i32(ptr %a) #0 {
324314
; VBITS_GE_128-LABEL: sdiv_v16i32:
325315
; VBITS_GE_128: // %bb.0:
326316
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
327-
; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
328-
; VBITS_GE_128-NEXT: cmlt v2.4s, v0.4s, #0
329-
; VBITS_GE_128-NEXT: cmlt v5.4s, v1.4s, #0
330-
; VBITS_GE_128-NEXT: cmlt v6.4s, v3.4s, #0
331-
; VBITS_GE_128-NEXT: usra v0.4s, v2.4s, #27
332-
; VBITS_GE_128-NEXT: cmlt v2.4s, v4.4s, #0
333-
; VBITS_GE_128-NEXT: usra v1.4s, v5.4s, #27
334-
; VBITS_GE_128-NEXT: usra v3.4s, v6.4s, #27
335-
; VBITS_GE_128-NEXT: usra v4.4s, v2.4s, #27
336-
; VBITS_GE_128-NEXT: sshr v0.4s, v0.4s, #5
337-
; VBITS_GE_128-NEXT: sshr v1.4s, v1.4s, #5
338-
; VBITS_GE_128-NEXT: sshr v2.4s, v3.4s, #5
339-
; VBITS_GE_128-NEXT: sshr v3.4s, v4.4s, #5
317+
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
318+
; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
319+
; VBITS_GE_128-NEXT: asrd z0.s, p0/m, z0.s, #5
320+
; VBITS_GE_128-NEXT: asrd z1.s, p0/m, z1.s, #5
321+
; VBITS_GE_128-NEXT: asrd z2.s, p0/m, z2.s, #5
322+
; VBITS_GE_128-NEXT: asrd z3.s, p0/m, z3.s, #5
340323
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
341324
; VBITS_GE_128-NEXT: stp q2, q3, [x0]
342325
; VBITS_GE_128-NEXT: ret
@@ -439,19 +422,12 @@ define void @sdiv_v8i64(ptr %a) #0 {
439422
; VBITS_GE_128-LABEL: sdiv_v8i64:
440423
; VBITS_GE_128: // %bb.0:
441424
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
442-
; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
443-
; VBITS_GE_128-NEXT: cmlt v2.2d, v0.2d, #0
444-
; VBITS_GE_128-NEXT: cmlt v5.2d, v1.2d, #0
445-
; VBITS_GE_128-NEXT: cmlt v6.2d, v3.2d, #0
446-
; VBITS_GE_128-NEXT: usra v0.2d, v2.2d, #59
447-
; VBITS_GE_128-NEXT: cmlt v2.2d, v4.2d, #0
448-
; VBITS_GE_128-NEXT: usra v1.2d, v5.2d, #59
449-
; VBITS_GE_128-NEXT: usra v3.2d, v6.2d, #59
450-
; VBITS_GE_128-NEXT: usra v4.2d, v2.2d, #59
451-
; VBITS_GE_128-NEXT: sshr v0.2d, v0.2d, #5
452-
; VBITS_GE_128-NEXT: sshr v1.2d, v1.2d, #5
453-
; VBITS_GE_128-NEXT: sshr v2.2d, v3.2d, #5
454-
; VBITS_GE_128-NEXT: sshr v3.2d, v4.2d, #5
425+
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
426+
; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
427+
; VBITS_GE_128-NEXT: asrd z0.d, p0/m, z0.d, #5
428+
; VBITS_GE_128-NEXT: asrd z1.d, p0/m, z1.d, #5
429+
; VBITS_GE_128-NEXT: asrd z2.d, p0/m, z2.d, #5
430+
; VBITS_GE_128-NEXT: asrd z3.d, p0/m, z3.d, #5
455431
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
456432
; VBITS_GE_128-NEXT: stp q2, q3, [x0]
457433
; VBITS_GE_128-NEXT: ret

0 commit comments

Comments
 (0)