Skip to content

Commit 147c5d6

Browse files
authored
[AArch64] Allow LDR merge with same destination register by renaming (#71908)
This patch is based on a reverted patch: https://reviews.llvm.org/D103597. That patch tried to rename registers before the alias check, which is not safe and caused miscompiles. This patch does two things: 1. It performs the renaming only after the necessary checks, including the alias check, have passed. 2. It renames the register in the instructions between the paired loads and combines the second load into the first. By doing so, we only need to check renamability between the paired loads and avoid scanning an unknown number of instructions before/after them. The necessary refactoring has been done in order to reuse as much code as possible with STR renaming.
1 parent a3cab1f commit 147c5d6

File tree

11 files changed

+490
-299
lines changed

11 files changed

+490
-299
lines changed

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 180 additions & 80 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,8 @@ define dso_local void @test_va_copy() {
125125

126126
; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
127127

128-
; CHECK: ldr [[BLOCKB:q[0-9]+]], [x[[SRC]], #16]
128+
; CHECK: ldp [[BLOCKA:q[0-9]+]], [[BLOCKB:q[0-9]+]], [x[[SRC]]]
129129
; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
130-
; CHECK: ldr [[BLOCKA:q[0-9]+]], [x[[SRC]]]
131130
; CHECK: stp [[BLOCKA]], [[BLOCKB]], [x[[DST]]]
132131
ret void
133132
; CHECK: ret

llvm/test/CodeGen/AArch64/fexplog.ll

Lines changed: 40 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -713,14 +713,12 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
713713
; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
714714
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
715715
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
716-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
716+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
717717
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
718-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
719-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
718+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
720719
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
721-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
722-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
723-
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
720+
; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
721+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
724722
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
725723
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
726724
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
@@ -963,14 +961,12 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
963961
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
964962
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
965963
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
966-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
964+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
967965
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
968-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
969-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
966+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
970967
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
971-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
972-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
973-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
968+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
969+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
974970
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
975971
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
976972
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
@@ -1994,14 +1990,12 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
19941990
; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
19951991
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
19961992
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
1997-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
1993+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
19981994
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
1999-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
2000-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
1995+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
20011996
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
2002-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
2003-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
2004-
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
1997+
; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
1998+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
20051999
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
20062000
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
20072001
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
@@ -2244,14 +2238,12 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
22442238
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
22452239
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
22462240
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
2247-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
2241+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
22482242
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
2249-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
2250-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
2243+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
22512244
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
2252-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
2253-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
2254-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
2245+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
2246+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
22552247
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
22562248
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
22572249
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
@@ -3275,14 +3267,12 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
32753267
; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
32763268
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
32773269
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
3278-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
3270+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
32793271
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
3280-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
3281-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
3272+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
32823273
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
3283-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
3284-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
3285-
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
3274+
; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
3275+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
32863276
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
32873277
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
32883278
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
@@ -3525,14 +3515,12 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
35253515
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
35263516
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
35273517
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
3528-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
3518+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
35293519
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
3530-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
3531-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
3520+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
35323521
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
3533-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
3534-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
3535-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
3522+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
3523+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
35363524
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
35373525
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
35383526
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
@@ -4556,14 +4544,12 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
45564544
; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
45574545
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
45584546
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
4559-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
4547+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
45604548
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
4561-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
4562-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
4549+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
45634550
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
4564-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
4565-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
4566-
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
4551+
; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
4552+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
45674553
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
45684554
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
45694555
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
@@ -4806,14 +4792,12 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
48064792
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
48074793
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
48084794
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
4809-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
4795+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
48104796
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
4811-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
4812-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
4797+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
48134798
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
4814-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
4815-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
4816-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
4799+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
4800+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
48174801
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
48184802
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
48194803
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
@@ -5837,14 +5821,12 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
58375821
; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
58385822
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
58395823
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
5840-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
5824+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
58415825
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
5842-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
5843-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
5826+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
58445827
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
5845-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
5846-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
5847-
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
5828+
; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
5829+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
58485830
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
58495831
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
58505832
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
@@ -6087,14 +6069,12 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
60876069
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
60886070
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
60896071
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
6090-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
6072+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
60916073
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
6092-
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
6093-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
6074+
; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
60946075
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
6095-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
6096-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
6097-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
6076+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
6077+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
60986078
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
60996079
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
61006080
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]

llvm/test/CodeGen/AArch64/fpow.ll

Lines changed: 34 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -267,14 +267,13 @@ define <4 x double> @pow_v4f64(<4 x double> %a, <4 x double> %b) {
267267
; CHECK-GI-NEXT: fmov d1, d11
268268
; CHECK-GI-NEXT: fmov d0, d9
269269
; CHECK-GI-NEXT: bl pow
270-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
271-
; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
270+
; CHECK-GI-NEXT: ldp q3, q1, [sp, #16] // 32-byte Folded Reload
272271
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
273-
; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
272+
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
274273
; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
275-
; CHECK-GI-NEXT: mov v2.d[1], v1.d[0]
276-
; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
274+
; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
277275
; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
276+
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
278277
; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
279278
; CHECK-GI-NEXT: mov v0.16b, v2.16b
280279
; CHECK-GI-NEXT: add sp, sp, #112
@@ -715,14 +714,14 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
715714
; CHECK-SD-NEXT: fcvt s1, h1
716715
; CHECK-SD-NEXT: bl powf
717716
; CHECK-SD-NEXT: fcvt h0, s0
718-
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
719-
; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
717+
; CHECK-SD-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload
718+
; CHECK-SD-NEXT: mov h1, v1.h[2]
719+
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
720+
; CHECK-SD-NEXT: fcvt s1, h1
720721
; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
721-
; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
722+
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
722723
; CHECK-SD-NEXT: mov h0, v0.h[2]
723-
; CHECK-SD-NEXT: mov h1, v1.h[2]
724724
; CHECK-SD-NEXT: fcvt s0, h0
725-
; CHECK-SD-NEXT: fcvt s1, h1
726725
; CHECK-SD-NEXT: bl powf
727726
; CHECK-SD-NEXT: fcvt h0, s0
728727
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
@@ -858,14 +857,13 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
858857
; CHECK-GI-NEXT: fcvt s1, h0
859858
; CHECK-GI-NEXT: fmov s0, s2
860859
; CHECK-GI-NEXT: bl powf
861-
; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
862-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
860+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
863861
; CHECK-GI-NEXT: fcvt h0, s0
862+
; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
864863
; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
865-
; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload
866-
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
867-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
868864
; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload
865+
; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload
866+
; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
869867
; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload
870868
; CHECK-GI-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
871869
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
@@ -1020,14 +1018,14 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) {
10201018
; CHECK-SD-NEXT: fcvt s1, h1
10211019
; CHECK-SD-NEXT: bl powf
10221020
; CHECK-SD-NEXT: fcvt h0, s0
1023-
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
1024-
; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
1021+
; CHECK-SD-NEXT: ldp q1, q2, [sp, #16] // 32-byte Folded Reload
1022+
; CHECK-SD-NEXT: mov h1, v1.h[2]
1023+
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
1024+
; CHECK-SD-NEXT: fcvt s1, h1
10251025
; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
1026-
; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
1026+
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
10271027
; CHECK-SD-NEXT: mov h0, v0.h[2]
1028-
; CHECK-SD-NEXT: mov h1, v1.h[2]
10291028
; CHECK-SD-NEXT: fcvt s0, h0
1030-
; CHECK-SD-NEXT: fcvt s1, h1
10311029
; CHECK-SD-NEXT: bl powf
10321030
; CHECK-SD-NEXT: fcvt h0, s0
10331031
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
@@ -1175,22 +1173,20 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) {
11751173
; CHECK-GI-NEXT: fcvt s1, h0
11761174
; CHECK-GI-NEXT: fmov s0, s2
11771175
; CHECK-GI-NEXT: bl powf
1178-
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
1179-
; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload
1176+
; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
11801177
; CHECK-GI-NEXT: fcvt h0, s0
1178+
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
11811179
; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload
1182-
; CHECK-GI-NEXT: ldr x30, [sp, #176] // 8-byte Folded Reload
1183-
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
1184-
; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
11851180
; CHECK-GI-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload
1181+
; CHECK-GI-NEXT: ldr x30, [sp, #176] // 8-byte Folded Reload
1182+
; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
11861183
; CHECK-GI-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload
11871184
; CHECK-GI-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload
11881185
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
11891186
; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
11901187
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
1191-
; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
1192-
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
1193-
; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload
1188+
; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
1189+
; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
11941190
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
11951191
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
11961192
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
@@ -1225,14 +1221,13 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) {
12251221
; CHECK-SD-NEXT: fcvt s0, h0
12261222
; CHECK-SD-NEXT: bl powf
12271223
; CHECK-SD-NEXT: fcvt h0, s0
1228-
; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
1229-
; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
1230-
; CHECK-SD-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
1224+
; CHECK-SD-NEXT: ldp q1, q2, [sp, #48] // 32-byte Folded Reload
12311225
; CHECK-SD-NEXT: mov h1, v1.h[2]
1226+
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
1227+
; CHECK-SD-NEXT: fcvt s1, h1
12321228
; CHECK-SD-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
12331229
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
12341230
; CHECK-SD-NEXT: mov h0, v0.h[2]
1235-
; CHECK-SD-NEXT: fcvt s1, h1
12361231
; CHECK-SD-NEXT: fcvt s0, h0
12371232
; CHECK-SD-NEXT: bl powf
12381233
; CHECK-SD-NEXT: fcvt h0, s0
@@ -1307,14 +1302,14 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) {
13071302
; CHECK-SD-NEXT: fcvt s1, h1
13081303
; CHECK-SD-NEXT: bl powf
13091304
; CHECK-SD-NEXT: fcvt h0, s0
1310-
; CHECK-SD-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
1311-
; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
1305+
; CHECK-SD-NEXT: ldp q1, q2, [sp, #32] // 32-byte Folded Reload
1306+
; CHECK-SD-NEXT: mov h1, v1.h[2]
1307+
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
1308+
; CHECK-SD-NEXT: fcvt s1, h1
13121309
; CHECK-SD-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
1313-
; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload
1310+
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
13141311
; CHECK-SD-NEXT: mov h0, v0.h[2]
1315-
; CHECK-SD-NEXT: mov h1, v1.h[2]
13161312
; CHECK-SD-NEXT: fcvt s0, h0
1317-
; CHECK-SD-NEXT: fcvt s1, h1
13181313
; CHECK-SD-NEXT: bl powf
13191314
; CHECK-SD-NEXT: fcvt h0, s0
13201315
; CHECK-SD-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
@@ -1578,9 +1573,8 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) {
15781573
; CHECK-GI-NEXT: mov v3.h[3], v2.h[0]
15791574
; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload
15801575
; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
1581-
; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload
1582-
; CHECK-GI-NEXT: mov v3.h[4], v2.h[0]
1583-
; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload
1576+
; CHECK-GI-NEXT: ldp q4, q2, [sp, #240] // 32-byte Folded Reload
1577+
; CHECK-GI-NEXT: mov v3.h[4], v4.h[0]
15841578
; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
15851579
; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload
15861580
; CHECK-GI-NEXT: mov v3.h[5], v2.h[0]

0 commit comments

Comments
 (0)