Skip to content

Commit 75d820d

Browse files
authored
[AArch64] MI Scheduler: create more LDP/STP pairs (#77565)
Target hook `canPairLdStOpc` is missing quite a few opcodes for which LDPs/STPs can be created. I was hoping that it would not be necessary to add these missing opcodes here and that the attached motivating test case would be handled by the LoadStoreOptimiser (especially after #71908), but it's not. The problem is that after register allocation some things are a lot harder to do. Consider this for the motivating example ``` [1] renamable $q1 = LDURQi renamable $x9, -16 :: (load (s128) from %ir.r51, align 8, !tbaa !0) [2] renamable $q2 = LDURQi renamable $x0, -16 :: (load (s128) from %ir.r53, align 8, !tbaa !4) [3] renamable $q1 = nnan ninf nsz arcp contract afn reassoc nofpexcept FMLSv2f64 killed renamable $q1(tied-def 0), killed renamable $q2, renamable $q0, implicit $fpcr [4] STURQi killed renamable $q1, renamable $x9, -16 :: (store (s128) into %ir.r51, align 1, !tbaa !0) [5] renamable $q1 = LDRQui renamable $x9, 0 :: (load (s128) from %ir.r.G0001_609.0, align 8, !tbaa !0) ``` We can't combine the load on line [5] into the load on line [1]: register q1 is used in between. And we can't combine [1] into [5]: it aliases with the STR on line [4]. So, adding some missing opcodes here seems the best/easiest approach. I will follow up to add some more missing cases here.
1 parent 7bf13fe commit 75d820d

File tree

3 files changed

+48
-4
lines changed

3 files changed

+48
-4
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4214,6 +4214,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
42144214
switch (FirstOpc) {
42154215
default:
42164216
return false;
4217+
case AArch64::LDRQui:
4218+
case AArch64::LDURQi:
4219+
return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
42174220
case AArch64::LDRWui:
42184221
case AArch64::LDURWi:
42194222
return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;

llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,44 @@ define <2 x i64> @ldq_cluster(ptr %p) {
113113
%res = mul nsw <2 x i64> %tmp2, %tmp3
114114
ret <2 x i64> %res
115115
}
116+
117+
; Test LDURQi / LDRQui clustering
118+
;
119+
; CHECK: ********** MI Scheduling **********
120+
; CHECK: LDURQi_LDRQui:%bb.1 vector_body
121+
;
122+
; CHECK: Cluster ld/st SU(0) - SU(4)
123+
; CHECK: Cluster ld/st SU(1) - SU(5)
124+
;
125+
; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
126+
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
127+
; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
128+
; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
129+
;
130+
define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
131+
entry:
132+
br label %vector_body
133+
vector_body:
134+
%phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
135+
%phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
136+
%phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
137+
%r51 = getelementptr i8, ptr %phi1, i64 -16
138+
%r52 = load <2 x double>, ptr %r51, align 8
139+
%r53 = getelementptr i8, ptr %phi2, i64 -16
140+
%r54 = load <2 x double>, ptr %r53, align 8
141+
%r55 = fmul fast <2 x double> %r54, <double 3.0, double 4.0>
142+
%r56 = fsub fast <2 x double> %r52, %r55
143+
store <2 x double> %r56, ptr %r51, align 1
144+
%r57 = load <2 x double>, ptr %phi1, align 8
145+
%r58 = load <2 x double>, ptr %phi2, align 8
146+
%r59 = fmul fast <2 x double> %r58,<double 3.0, double 4.0>
147+
%r60 = fsub fast <2 x double> %r57, %r59
148+
store <2 x double> %r60, ptr %phi1, align 1
149+
%r61 = add i32 %phi3, 4
150+
%r62 = getelementptr i8, ptr %phi2, i64 32
151+
%r63 = getelementptr i8, ptr %phi1, i64 32
152+
%r.not = icmp eq i32 %r61, 0
153+
br i1 %r.not, label %exit, label %vector_body
154+
exit:
155+
ret void
156+
}

llvm/test/CodeGen/AArch64/machine-combiner-copy.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
2020
; CHECK-NEXT: mov x12, x9
2121
; CHECK-NEXT: .LBB0_4: // %vector.body
2222
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
23-
; CHECK-NEXT: ldp q1, q3, [x11, #-16]
23+
; CHECK-NEXT: ldp q1, q4, [x10, #-16]
2424
; CHECK-NEXT: subs x12, x12, #16
25-
; CHECK-NEXT: ldp q2, q4, [x10, #-16]
25+
; CHECK-NEXT: ldp q2, q3, [x11, #-16]
2626
; CHECK-NEXT: add x11, x11, #32
27-
; CHECK-NEXT: fmla v2.8h, v1.8h, v0.h[0]
27+
; CHECK-NEXT: fmla v1.8h, v2.8h, v0.h[0]
2828
; CHECK-NEXT: fmla v4.8h, v3.8h, v0.h[0]
29-
; CHECK-NEXT: stp q2, q4, [x10, #-16]
29+
; CHECK-NEXT: stp q1, q4, [x10, #-16]
3030
; CHECK-NEXT: add x10, x10, #32
3131
; CHECK-NEXT: b.ne .LBB0_4
3232
; CHECK-NEXT: // %bb.5: // %middle.block

0 commit comments

Comments
 (0)