Skip to content

Commit dc7b743

Browse files
davemgreen authored and tstellar committed
[AArch64] Fix SVE scalar fcopysign lowering without neon. (llvm#129787)
Without this we can try to generate invalid instructions or create illegal types. This patch generates an SVE fcopysign instead and uses its lowering. BF16 is left out for the moment as it doesn't lower successfully (but could use the same code as fp16). (cherry picked from commit d4ab3df)
1 parent fcd0ad2 commit dc7b743

File tree

2 files changed

+74
-84
lines changed

2 files changed

+74
-84
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10683,6 +10683,25 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
1068310683
return convertFromScalableVector(DAG, VT, Res);
1068410684
}
1068510685

10686+
// With SVE, but without Neon, extend the scalars to scalable vectors and use
10687+
// a SVE FCOPYSIGN.
10688+
if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
10689+
Subtarget->isSVEorStreamingSVEAvailable()) {
10690+
if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
10691+
return SDValue();
10692+
EVT SVT = getPackedSVEVectorVT(VT);
10693+
10694+
SDValue Ins1 =
10695+
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
10696+
DAG.getConstant(0, DL, MVT::i64));
10697+
SDValue Ins2 =
10698+
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
10699+
DAG.getConstant(0, DL, MVT::i64));
10700+
SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
10701+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
10702+
DAG.getConstant(0, DL, MVT::i64));
10703+
}
10704+
1068610705
auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
1068710706
if (VT.isScalableVector())
1068810707
return getSVESafeBitCast(VT, Op, DAG);

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll

Lines changed: 55 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -11,32 +11,21 @@ target triple = "aarch64-unknown-linux-gnu"
1111
define void @test_copysign_f16(ptr %ap, ptr %bp) {
1212
; SVE-LABEL: test_copysign_f16:
1313
; SVE: // %bb.0:
14-
; SVE-NEXT: adrp x8, .LCPI0_0
14+
; SVE-NEXT: ldr h0, [x1]
1515
; SVE-NEXT: ldr h1, [x0]
16-
; SVE-NEXT: ldr h2, [x1]
17-
; SVE-NEXT: ldr q0, [x8, :lo12:.LCPI0_0]
18-
; SVE-NEXT: adrp x8, .LCPI0_1
19-
; SVE-NEXT: ldr q4, [x8, :lo12:.LCPI0_1]
20-
; SVE-NEXT: mov z3.d, z0.d
21-
; SVE-NEXT: fmov s0, s1
22-
; SVE-NEXT: fmov s3, s2
23-
; SVE-NEXT: bif v0.16b, v3.16b, v4.16b
16+
; SVE-NEXT: and z0.h, z0.h, #0x8000
17+
; SVE-NEXT: and z1.h, z1.h, #0x7fff
18+
; SVE-NEXT: orr z0.d, z1.d, z0.d
2419
; SVE-NEXT: str h0, [x0]
2520
; SVE-NEXT: ret
2621
;
2722
; SVE2-LABEL: test_copysign_f16:
2823
; SVE2: // %bb.0:
29-
; SVE2-NEXT: adrp x8, .LCPI0_0
30-
; SVE2-NEXT: ldr h1, [x0]
31-
; SVE2-NEXT: ldr h2, [x1]
32-
; SVE2-NEXT: ldr q0, [x8, :lo12:.LCPI0_0]
33-
; SVE2-NEXT: adrp x8, .LCPI0_1
34-
; SVE2-NEXT: ldr q4, [x8, :lo12:.LCPI0_1]
35-
; SVE2-NEXT: mov z3.d, z0.d
36-
; SVE2-NEXT: fmov s0, s1
37-
; SVE2-NEXT: fmov s3, s2
38-
; SVE2-NEXT: bif v0.16b, v3.16b, v4.16b
39-
; SVE2-NEXT: str h0, [x0]
24+
; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
25+
; SVE2-NEXT: ldr h1, [x1]
26+
; SVE2-NEXT: ldr h2, [x0]
27+
; SVE2-NEXT: bsl z2.d, z2.d, z1.d, z0.d
28+
; SVE2-NEXT: str h2, [x0]
4029
; SVE2-NEXT: ret
4130
;
4231
; NONEON-NOSVE-LABEL: test_copysign_f16:
@@ -66,32 +55,40 @@ define void @test_copysign_f16(ptr %ap, ptr %bp) {
6655
define void @test_copysign_bf16(ptr %ap, ptr %bp) {
6756
; SVE-LABEL: test_copysign_bf16:
6857
; SVE: // %bb.0:
69-
; SVE-NEXT: adrp x8, .LCPI1_0
70-
; SVE-NEXT: ldr h1, [x0]
71-
; SVE-NEXT: ldr h2, [x1]
72-
; SVE-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
73-
; SVE-NEXT: adrp x8, .LCPI1_1
74-
; SVE-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
75-
; SVE-NEXT: mov z3.d, z0.d
76-
; SVE-NEXT: fmov s0, s1
77-
; SVE-NEXT: fmov s3, s2
78-
; SVE-NEXT: bif v0.16b, v3.16b, v4.16b
58+
; SVE-NEXT: sub sp, sp, #16
59+
; SVE-NEXT: .cfi_def_cfa_offset 16
60+
; SVE-NEXT: ldr h0, [x0]
61+
; SVE-NEXT: ldr h1, [x1]
62+
; SVE-NEXT: fmov w8, s0
63+
; SVE-NEXT: str h1, [sp, #12]
64+
; SVE-NEXT: ldrb w9, [sp, #13]
65+
; SVE-NEXT: and w8, w8, #0x7fff
66+
; SVE-NEXT: tst w9, #0x80
67+
; SVE-NEXT: fmov s0, w8
68+
; SVE-NEXT: eor w8, w8, #0x8000
69+
; SVE-NEXT: fmov s1, w8
70+
; SVE-NEXT: fcsel h0, h1, h0, ne
7971
; SVE-NEXT: str h0, [x0]
72+
; SVE-NEXT: add sp, sp, #16
8073
; SVE-NEXT: ret
8174
;
8275
; SVE2-LABEL: test_copysign_bf16:
8376
; SVE2: // %bb.0:
84-
; SVE2-NEXT: adrp x8, .LCPI1_0
85-
; SVE2-NEXT: ldr h1, [x0]
86-
; SVE2-NEXT: ldr h2, [x1]
87-
; SVE2-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
88-
; SVE2-NEXT: adrp x8, .LCPI1_1
89-
; SVE2-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
90-
; SVE2-NEXT: mov z3.d, z0.d
91-
; SVE2-NEXT: fmov s0, s1
92-
; SVE2-NEXT: fmov s3, s2
93-
; SVE2-NEXT: bif v0.16b, v3.16b, v4.16b
77+
; SVE2-NEXT: sub sp, sp, #16
78+
; SVE2-NEXT: .cfi_def_cfa_offset 16
79+
; SVE2-NEXT: ldr h0, [x0]
80+
; SVE2-NEXT: ldr h1, [x1]
81+
; SVE2-NEXT: fmov w8, s0
82+
; SVE2-NEXT: str h1, [sp, #12]
83+
; SVE2-NEXT: ldrb w9, [sp, #13]
84+
; SVE2-NEXT: and w8, w8, #0x7fff
85+
; SVE2-NEXT: tst w9, #0x80
86+
; SVE2-NEXT: fmov s0, w8
87+
; SVE2-NEXT: eor w8, w8, #0x8000
88+
; SVE2-NEXT: fmov s1, w8
89+
; SVE2-NEXT: fcsel h0, h1, h0, ne
9490
; SVE2-NEXT: str h0, [x0]
91+
; SVE2-NEXT: add sp, sp, #16
9592
; SVE2-NEXT: ret
9693
;
9794
; NONEON-NOSVE-LABEL: test_copysign_bf16:
@@ -139,32 +136,21 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) {
139136
define void @test_copysign_f32(ptr %ap, ptr %bp) {
140137
; SVE-LABEL: test_copysign_f32:
141138
; SVE: // %bb.0:
142-
; SVE-NEXT: adrp x8, .LCPI2_0
139+
; SVE-NEXT: ldr s0, [x1]
143140
; SVE-NEXT: ldr s1, [x0]
144-
; SVE-NEXT: ldr s2, [x1]
145-
; SVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0]
146-
; SVE-NEXT: adrp x8, .LCPI2_1
147-
; SVE-NEXT: ldr q4, [x8, :lo12:.LCPI2_1]
148-
; SVE-NEXT: mov z3.d, z0.d
149-
; SVE-NEXT: fmov s0, s1
150-
; SVE-NEXT: fmov s3, s2
151-
; SVE-NEXT: bif v0.16b, v3.16b, v4.16b
141+
; SVE-NEXT: and z0.s, z0.s, #0x80000000
142+
; SVE-NEXT: and z1.s, z1.s, #0x7fffffff
143+
; SVE-NEXT: orr z0.d, z1.d, z0.d
152144
; SVE-NEXT: str s0, [x0]
153145
; SVE-NEXT: ret
154146
;
155147
; SVE2-LABEL: test_copysign_f32:
156148
; SVE2: // %bb.0:
157-
; SVE2-NEXT: adrp x8, .LCPI2_0
158-
; SVE2-NEXT: ldr s1, [x0]
159-
; SVE2-NEXT: ldr s2, [x1]
160-
; SVE2-NEXT: ldr q0, [x8, :lo12:.LCPI2_0]
161-
; SVE2-NEXT: adrp x8, .LCPI2_1
162-
; SVE2-NEXT: ldr q4, [x8, :lo12:.LCPI2_1]
163-
; SVE2-NEXT: mov z3.d, z0.d
164-
; SVE2-NEXT: fmov s0, s1
165-
; SVE2-NEXT: fmov s3, s2
166-
; SVE2-NEXT: bif v0.16b, v3.16b, v4.16b
167-
; SVE2-NEXT: str s0, [x0]
149+
; SVE2-NEXT: mov z0.s, #0x7fffffff
150+
; SVE2-NEXT: ldr s1, [x1]
151+
; SVE2-NEXT: ldr s2, [x0]
152+
; SVE2-NEXT: bsl z2.d, z2.d, z1.d, z0.d
153+
; SVE2-NEXT: str s2, [x0]
168154
; SVE2-NEXT: ret
169155
;
170156
; NONEON-NOSVE-LABEL: test_copysign_f32:
@@ -187,36 +173,21 @@ define void @test_copysign_f32(ptr %ap, ptr %bp) {
187173
define void @test_copysign_f64(ptr %ap, ptr %bp) {
188174
; SVE-LABEL: test_copysign_f64:
189175
; SVE: // %bb.0:
190-
; SVE-NEXT: adrp x8, .LCPI3_1
191-
; SVE-NEXT: ptrue p0.d, vl2
192-
; SVE-NEXT: ldr d2, [x0]
193-
; SVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
194-
; SVE-NEXT: adrp x8, .LCPI3_0
195-
; SVE-NEXT: ldr d3, [x1]
196-
; SVE-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
197-
; SVE-NEXT: fneg z0.d, p0/m, z0.d
198-
; SVE-NEXT: mov z4.d, z1.d
199-
; SVE-NEXT: fmov d1, d2
200-
; SVE-NEXT: fmov d4, d3
201-
; SVE-NEXT: bsl v0.16b, v1.16b, v4.16b
176+
; SVE-NEXT: ldr d0, [x1]
177+
; SVE-NEXT: ldr d1, [x0]
178+
; SVE-NEXT: and z0.d, z0.d, #0x8000000000000000
179+
; SVE-NEXT: and z1.d, z1.d, #0x7fffffffffffffff
180+
; SVE-NEXT: orr z0.d, z1.d, z0.d
202181
; SVE-NEXT: str d0, [x0]
203182
; SVE-NEXT: ret
204183
;
205184
; SVE2-LABEL: test_copysign_f64:
206185
; SVE2: // %bb.0:
207-
; SVE2-NEXT: adrp x8, .LCPI3_1
208-
; SVE2-NEXT: ptrue p0.d, vl2
186+
; SVE2-NEXT: mov z0.d, #0x7fffffffffffffff
187+
; SVE2-NEXT: ldr d1, [x1]
209188
; SVE2-NEXT: ldr d2, [x0]
210-
; SVE2-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
211-
; SVE2-NEXT: adrp x8, .LCPI3_0
212-
; SVE2-NEXT: ldr d3, [x1]
213-
; SVE2-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
214-
; SVE2-NEXT: fneg z0.d, p0/m, z0.d
215-
; SVE2-NEXT: mov z4.d, z1.d
216-
; SVE2-NEXT: fmov d1, d2
217-
; SVE2-NEXT: fmov d4, d3
218-
; SVE2-NEXT: bsl v0.16b, v1.16b, v4.16b
219-
; SVE2-NEXT: str d0, [x0]
189+
; SVE2-NEXT: bsl z2.d, z2.d, z1.d, z0.d
190+
; SVE2-NEXT: str d2, [x0]
220191
; SVE2-NEXT: ret
221192
;
222193
; NONEON-NOSVE-LABEL: test_copysign_f64:

0 commit comments

Comments
 (0)