Skip to content

Commit dc9c5c1

Browse files
committed
[AArch64] MI Scheduler LDP/STP combine follow up
This is a follow-up to 75d820d that adds more opcodes to the combine target hook, enabling more LDP/STP creation. Patch co-authored by Cameron McInally.
1 parent 77f2ccb commit dc9c5c1

File tree

3 files changed

+178
-11
lines changed

3 files changed

+178
-11
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4214,6 +4214,27 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
42144214
switch (FirstOpc) {
42154215
default:
42164216
return false;
4217+
case AArch64::STRSui:
4218+
case AArch64::STURSi:
4219+
return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4220+
case AArch64::STRDui:
4221+
case AArch64::STURDi:
4222+
return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4223+
case AArch64::STRQui:
4224+
case AArch64::STURQi:
4225+
return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4226+
case AArch64::STRWui:
4227+
case AArch64::STURWi:
4228+
return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4229+
case AArch64::STRXui:
4230+
case AArch64::STURXi:
4231+
return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4232+
case AArch64::LDRSui:
4233+
case AArch64::LDURSi:
4234+
return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4235+
case AArch64::LDRDui:
4236+
case AArch64::LDURDi:
4237+
return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
42174238
case AArch64::LDRQui:
42184239
case AArch64::LDURQi:
42194240
return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
@@ -4223,6 +4244,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
42234244
case AArch64::LDRSWui:
42244245
case AArch64::LDURSWi:
42254246
return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4247+
case AArch64::LDRXui:
4248+
case AArch64::LDURXi:
4249+
return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
42264250
}
42274251
// These instructions can't be paired based on their opcodes.
42284252
return false;

llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; REQUIRES: asserts
2-
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
2+
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
33
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
44

55
; Test ldr clustering.
@@ -114,6 +114,22 @@ define <2 x i64> @ldq_cluster(ptr %p) {
114114
ret <2 x i64> %res
115115
}
116116

117+
; CHECK: ********** MI Scheduling **********
118+
; CHECK: LDURSi_LDRSui:%bb.0 entry
119+
; CHECK: Cluster ld/st SU(3) - SU(4)
120+
; CHECK: SU(3): %3:fpr32 = LDURSi %0:gpr64
121+
; CHECK: SU(4): %4:fpr32 = LDRSui %0:gpr64
122+
;
123+
define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
124+
entry:
125+
%r51 = getelementptr i8, ptr %arg, i64 -4
126+
%r52 = load float, ptr %r51, align 4
127+
%r53 = load float, ptr %arg, align 4
128+
store float %r52, ptr %wa
129+
store float %r53, ptr %wb
130+
ret void
131+
}
132+
117133
; Test LDURQi / LDRQui clustering
118134
;
119135
; CHECK: ********** MI Scheduling **********
@@ -154,3 +170,130 @@ vector_body:
154170
exit:
155171
ret void
156172
}
173+
174+
; Test LDURDi / LDRDui clustering
175+
;
176+
; CHECK: ********** MI Scheduling **********
177+
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
178+
;
179+
; CHECK: Cluster ld/st SU(2) - SU(6)
180+
; CHECK: Cluster ld/st SU(3) - SU(7)
181+
;
182+
; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
183+
; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
184+
; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
185+
; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
186+
;
187+
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
188+
entry:
189+
br label %vector_body
190+
vector_body:
191+
%phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
192+
%phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
193+
%phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
194+
%r51 = getelementptr i8, ptr %phi1, i64 -8
195+
%r52 = load <2 x float>, ptr %r51, align 8
196+
%r53 = getelementptr i8, ptr %phi2, i64 -8
197+
%r54 = load <2 x float>, ptr %r53, align 8
198+
%r55 = fmul fast <2 x float> %r54, <float 3.0, float 4.0>
199+
%r56 = fsub fast <2 x float> %r52, %r55
200+
store <2 x float> %r56, ptr %r51, align 1
201+
%r57 = load <2 x float>, ptr %phi1, align 8
202+
%r58 = load <2 x float>, ptr %phi2, align 8
203+
%r59 = fmul fast <2 x float> %r58, <float 3.0, float 4.0>
204+
%r60 = fsub fast <2 x float> %r57, %r59
205+
store <2 x float> %r60, ptr %phi1, align 1
206+
%r61 = add i32 %phi3, 4
207+
%r62 = getelementptr i8, ptr %phi2, i64 32
208+
%r63 = getelementptr i8, ptr %phi1, i64 32
209+
%r.not = icmp eq i32 %r61, 0
210+
br i1 %r.not, label %exit, label %vector_body
211+
exit:
212+
ret void
213+
}
214+
215+
; CHECK: ********** MI Scheduling **********
216+
; CHECK: LDURXi_LDRXui:%bb.0 entry
217+
; CHECK: Cluster ld/st SU(3) - SU(4)
218+
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
219+
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
220+
;
221+
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
222+
entry:
223+
%r51 = getelementptr i8, ptr %arg, i64 -8
224+
%r52 = load i64, ptr %r51, align 8
225+
%r53 = load i64, ptr %arg, align 8
226+
store i64 %r52, ptr %wa
227+
store i64 %r53, ptr %wb
228+
ret void
229+
}
230+
231+
; CHECK: ********** MI Scheduling **********
232+
; CHECK: STURWi_STRWui:%bb.0 entry
233+
; CHECK: Cluster ld/st SU(3) - SU(4)
234+
; CHECK: SU(3): STURWi %{{[0-9]+}}:gpr32
235+
; CHECK: SU(4): STRWui %{{[0-9]+}}:gpr32
236+
;
237+
define void @STURWi_STRWui(ptr nocapture readonly %arg, i32 %b, i32 %c) {
238+
entry:
239+
%r51 = getelementptr i8, ptr %arg, i64 -4
240+
store i32 %b, ptr %r51
241+
store i32 %c, ptr %arg
242+
ret void
243+
}
244+
245+
; CHECK: ********** MI Scheduling **********
246+
; CHECK: STURXi_STRXui:%bb.0 entry
247+
; CHECK: Cluster ld/st SU(3) - SU(4)
248+
; CHECK: SU(3): STURXi %{{[0-9]+}}:gpr64
249+
; CHECK: SU(4): STRXui %{{[0-9]+}}:gpr64
250+
;
251+
define void @STURXi_STRXui(ptr nocapture readonly %arg, i64 %b, i64 %c) {
252+
entry:
253+
%r51 = getelementptr i8, ptr %arg, i64 -8
254+
store i64 %b, ptr %r51
255+
store i64 %c, ptr %arg
256+
ret void
257+
}
258+
259+
; CHECK-A57: ********** MI Scheduling **********
260+
; CHECK-A57: STURSi_STRSui:%bb.0 entry
261+
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
262+
; CHECK-A57: SU(3): STURSi %{{[0-9]+}}:fpr32
263+
; CHECK-A57: SU(4): STRSui %{{[0-9]+}}:fpr32
264+
;
265+
define void @STURSi_STRSui(ptr nocapture readonly %arg, float %b, float %c) {
266+
entry:
267+
%r51 = getelementptr i8, ptr %arg, i64 -4
268+
store float %b, ptr %r51
269+
store float %c, ptr %arg
270+
ret void
271+
}
272+
273+
; CHECK-A57: ********** MI Scheduling **********
274+
; CHECK-A57: STURDi_STRDui:%bb.0 entry
275+
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
276+
; CHECK-A57: SU(3): STURDi %{{[0-9]+}}:fpr64
277+
; CHECK-A57: SU(4): STRDui %{{[0-9]+}}:fpr64
278+
;
279+
define void @STURDi_STRDui(ptr nocapture readonly %arg, <2 x float> %b, <2 x float> %c) {
280+
entry:
281+
%r51 = getelementptr i8, ptr %arg, i64 -8
282+
store <2 x float> %b, ptr %r51
283+
store <2 x float> %c, ptr %arg
284+
ret void
285+
}
286+
287+
; CHECK-A57: ********** MI Scheduling **********
288+
; CHECK-A57: STURQi_STRQui:%bb.0 entry
289+
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
290+
; CHECK-A57: SU(3): STURQi %{{[0-9]+}}:fpr128
291+
; CHECK-A57: SU(4): STRQui %{{[0-9]+}}:fpr128
292+
;
293+
define void @STURQi_STRQui(ptr nocapture readonly %arg, <2 x double> %b, <2 x double> %c) {
294+
entry:
295+
%r51 = getelementptr i8, ptr %arg, i64 -16
296+
store <2 x double> %b, ptr %r51
297+
store <2 x double> %c, ptr %arg
298+
ret void
299+
}

llvm/test/CodeGen/AArch64/zext-to-tbl.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,31 +1680,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
16801680
; CHECK-NEXT: add x9, x0, #8
16811681
; CHECK-NEXT: LBB17_1: ; %loop
16821682
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
1683-
; CHECK-NEXT: ldp d2, d4, [x9, #-8]
1683+
; CHECK-NEXT: ldp d2, d3, [x9, #-8]
16841684
; CHECK-NEXT: add x10, x1, x8
16851685
; CHECK-NEXT: ldp q6, q5, [x10, #32]
16861686
; CHECK-NEXT: add x8, x8, #128
16871687
; CHECK-NEXT: ldp q17, q16, [x10]
16881688
; CHECK-NEXT: cmp x8, #1024
1689-
; CHECK-NEXT: tbl.16b v3, { v2 }, v1
1689+
; CHECK-NEXT: tbl.16b v4, { v2 }, v1
16901690
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
1691-
; CHECK-NEXT: tbl.16b v7, { v4 }, v1
1692-
; CHECK-NEXT: tbl.16b v4, { v4 }, v0
1691+
; CHECK-NEXT: tbl.16b v7, { v3 }, v1
1692+
; CHECK-NEXT: tbl.16b v3, { v3 }, v0
16931693
; CHECK-NEXT: add x9, x9, #16
1694-
; CHECK-NEXT: uaddw2.2d v5, v5, v3
1695-
; CHECK-NEXT: uaddw.2d v3, v6, v3
1694+
; CHECK-NEXT: uaddw2.2d v5, v5, v4
1695+
; CHECK-NEXT: uaddw.2d v4, v6, v4
16961696
; CHECK-NEXT: uaddw2.2d v6, v16, v2
16971697
; CHECK-NEXT: ldp q18, q16, [x10, #96]
16981698
; CHECK-NEXT: uaddw.2d v2, v17, v2
1699-
; CHECK-NEXT: stp q3, q5, [x10, #32]
1699+
; CHECK-NEXT: stp q4, q5, [x10, #32]
17001700
; CHECK-NEXT: ldp q17, q5, [x10, #64]
17011701
; CHECK-NEXT: uaddw2.2d v16, v16, v7
17021702
; CHECK-NEXT: uaddw.2d v7, v18, v7
17031703
; CHECK-NEXT: stp q2, q6, [x10]
1704-
; CHECK-NEXT: uaddw2.2d v3, v5, v4
1705-
; CHECK-NEXT: uaddw.2d v4, v17, v4
1704+
; CHECK-NEXT: uaddw2.2d v4, v5, v3
1705+
; CHECK-NEXT: uaddw.2d v3, v17, v3
17061706
; CHECK-NEXT: stp q7, q16, [x10, #96]
1707-
; CHECK-NEXT: stp q4, q3, [x10, #64]
1707+
; CHECK-NEXT: stp q3, q4, [x10, #64]
17081708
; CHECK-NEXT: b.ne LBB17_1
17091709
; CHECK-NEXT: ; %bb.2: ; %exit
17101710
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)