Skip to content

Commit 39ae86a

Browse files
committed
[AArch64TTI] AArch64 supports NT vector stores through STNP.
This patch adds a custom implementation of isLegalNTStore to AArch64TTI that supports vector types that can be directly stored by STNP. Note that the implementation may not catch all valid cases (e.g. because the vector size is a multiple of 256 bits and could be broken down into multiple valid 256-bit stores), but it is good enough for LV to vectorize loops with NT stores, as LV only passes in a vector with 2 elements to check. LV also seems to be the only user of isLegalNTStore. We should also do the same for NT loads, but before that we need to ensure that we properly lower LDNP of vectors, similar to D72919. Reviewers: dmgreen, samparker, t.p.northover, ab Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D73158
1 parent adc4faf commit 39ae86a

File tree

2 files changed

+277
-0
lines changed

2 files changed

+277
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,24 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
172172
return isLegalMaskedLoadStore(DataType, Alignment);
173173
}
174174

175+
/// Returns true if a nontemporal store of \p DataType is treated as legal.
///
/// For vector types the answer is computed here from the element count and
/// the scalar element width only; \p Alignment is not consulted on that path.
/// Every other type is forwarded, together with \p Alignment, to
/// BaseT::isLegalNTStore.
bool isLegalNTStore(Type *DataType, Align Alignment) {
176+
// NOTE: The logic below is mostly geared towards LV, which calls it with
177+
// vectors with 2 elements. We might want to improve that, if other
178+
// users show up.
179+
// Nontemporal vector stores can be directly lowered to STNP, if the vector
180+
// can be halved so that each half fits into a register. That's the case if
181+
// the element type fits into a register and the number of elements is a
182+
// power of 2 > 1.
183+
if (isa<VectorType>(DataType)) {
184+
unsigned NumElements = DataType->getVectorNumElements();
185+
unsigned EltSize =
186+
DataType->getVectorElementType()->getScalarSizeInBits();
// Accept only power-of-2 element counts greater than 1 and power-of-2
// element widths between 8 and 128 bits inclusive (8, 16, 32, 64, 128).
// NOTE(review): the total vector width is not bounded here, so a type such
// as <4 x i128> is accepted even though each half is wider than 128 bits;
// confirm that this is intended.
187+
return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
188+
EltSize <= 128 && isPowerOf2_64(EltSize);
189+
}
190+
// Non-vector types: defer to the generic implementation.
return BaseT::isLegalNTStore(DataType, Alignment);
191+
}
192+
175193
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
176194
ArrayRef<unsigned> Indices, unsigned Alignment,
177195
unsigned AddressSpace,
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphones -force-vector-width=4 -force-vector-interleave=1 %s -S | FileCheck %s
2+
3+
; Vectors with i4 elements may not be legal for nontemporal stores.
4+
; Loop with a single nontemporal i4 store per iteration. The directives below
; require that no vector.body block appears before the function returns, i.e.
; the loop is expected to stay scalar (4 bits is below the 8-bit minimum
; element width accepted by AArch64TTIImpl::isLegalNTStore).
define void @test_i4_store(i4* %ddst) {
5+
; CHECK-LABEL: define void @test_i4_store(
6+
; CHECK-NOT: vector.body:
7+
; CHECK: ret void
8+
;
9+
entry:
10+
br label %for.body
11+
12+
for.body: ; preds = %entry, %for.body
13+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
14+
%ddst.addr = phi i4* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
15+
%incdec.ptr = getelementptr inbounds i4, i4* %ddst.addr, i64 1
16+
store i4 10, i4* %ddst.addr, align 4, !nontemporal !8
17+
%add = add nuw nsw i32 %i, 4
18+
%cmp = icmp ult i32 %i, 4092
19+
br i1 %cmp, label %for.body, label %for.cond.cleanup
20+
21+
for.cond.cleanup: ; preds = %for.body
22+
ret void
23+
}
24+
25+
; Loop with a single nontemporal i8 store per iteration. The directives below
; expect a vector.body block containing a <4 x i8> store that still carries
; !nontemporal metadata.
define void @test_i8_store(i8* %ddst) {
26+
; CHECK-LABEL: define void @test_i8_store(
27+
; CHECK-LABEL: vector.body:
28+
; CHECK: store <4 x i8> {{.*}} !nontemporal !0
29+
; CHECK: br
30+
;
31+
entry:
32+
br label %for.body
33+
34+
for.body: ; preds = %entry, %for.body
35+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
36+
%ddst.addr = phi i8* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
37+
%incdec.ptr = getelementptr inbounds i8, i8* %ddst.addr, i64 1
38+
store i8 10, i8* %ddst.addr, align 4, !nontemporal !8
39+
%add = add nuw nsw i32 %i, 4
40+
%cmp = icmp ult i32 %i, 4092
41+
br i1 %cmp, label %for.body, label %for.cond.cleanup
42+
43+
for.cond.cleanup: ; preds = %for.body
44+
ret void
45+
}
46+
47+
; Loop with a single nontemporal half store per iteration. The directives
; below expect a vector.body block containing a <4 x half> store that still
; carries !nontemporal metadata.
define void @test_half_store(half* %ddst) {
48+
; CHECK-LABEL: define void @test_half_store(
49+
; CHECK-LABEL: vector.body:
50+
; CHECK: store <4 x half> {{.*}} !nontemporal !0
51+
; CHECK: br
52+
;
53+
entry:
54+
br label %for.body
55+
56+
for.body: ; preds = %entry, %for.body
57+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
58+
%ddst.addr = phi half* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
59+
%incdec.ptr = getelementptr inbounds half, half* %ddst.addr, i64 1
60+
store half 10.0, half* %ddst.addr, align 4, !nontemporal !8
61+
%add = add nuw nsw i32 %i, 4
62+
%cmp = icmp ult i32 %i, 4092
63+
br i1 %cmp, label %for.body, label %for.cond.cleanup
64+
65+
for.cond.cleanup: ; preds = %for.body
66+
ret void
67+
}
68+
69+
; Loop with a single nontemporal i16 store per iteration. The directives below
; expect a vector.body block containing a <4 x i16> store that still carries
; !nontemporal metadata.
define void @test_i16_store(i16* %ddst) {
70+
; CHECK-LABEL: define void @test_i16_store(
71+
; CHECK-LABEL: vector.body:
72+
; CHECK: store <4 x i16> {{.*}} !nontemporal !0
73+
; CHECK: br
74+
;
75+
entry:
76+
br label %for.body
77+
78+
for.body: ; preds = %entry, %for.body
79+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
80+
%ddst.addr = phi i16* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
81+
%incdec.ptr = getelementptr inbounds i16, i16* %ddst.addr, i64 1
82+
store i16 10, i16* %ddst.addr, align 4, !nontemporal !8
83+
%add = add nuw nsw i32 %i, 4
84+
%cmp = icmp ult i32 %i, 4092
85+
br i1 %cmp, label %for.body, label %for.cond.cleanup
86+
87+
for.cond.cleanup: ; preds = %for.body
88+
ret void
89+
}
90+
91+
; Loop that performs four nontemporal i32 stores to consecutive elements per
; iteration and then advances the pointer by four elements. The directives
; below expect a vector.body block containing a single <16 x i32> nontemporal
; store (presumably the four stores are combined into one wide store; confirm
; against the vectorizer output).
define void @test_i32_store(i32* nocapture %ddst) {
92+
; CHECK-LABEL: define void @test_i32_store(
93+
; CHECK-LABEL: vector.body:
94+
; CHECK: store <16 x i32> {{.*}} !nontemporal !0
95+
; CHECK: br
96+
;
97+
entry:
98+
br label %for.body
99+
100+
for.body: ; preds = %entry, %for.body
101+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
102+
%ddst.addr = phi i32* [ %ddst, %entry ], [ %incdec.ptr3, %for.body ]
103+
%incdec.ptr = getelementptr inbounds i32, i32* %ddst.addr, i64 1
104+
store i32 10, i32* %ddst.addr, align 4, !nontemporal !8
105+
%incdec.ptr1 = getelementptr inbounds i32, i32* %ddst.addr, i64 2
106+
store i32 20, i32* %incdec.ptr, align 4, !nontemporal !8
107+
%incdec.ptr2 = getelementptr inbounds i32, i32* %ddst.addr, i64 3
108+
store i32 30, i32* %incdec.ptr1, align 4, !nontemporal !8
109+
%incdec.ptr3 = getelementptr inbounds i32, i32* %ddst.addr, i64 4
110+
store i32 40, i32* %incdec.ptr2, align 4, !nontemporal !8
111+
%add = add nuw nsw i32 %i, 4
112+
%cmp = icmp ult i32 %i, 4092
113+
br i1 %cmp, label %for.body, label %for.cond.cleanup
114+
115+
for.cond.cleanup: ; preds = %for.body
116+
ret void
117+
}
118+
119+
; Loop that performs four nontemporal i33 stores per iteration. The directives
; below require that no vector.body block appears before the return, i.e. the
; loop is expected to stay scalar (33 is not a power-of-2 element width).
; NOTE(review): the induction variable steps by 3 while the pointer advances
; by 4 elements; this differs from the i32 variant. Confirm it is intentional.
define void @test_i33_store(i33* nocapture %ddst) {
120+
; CHECK-LABEL: define void @test_i33_store(
121+
; CHECK-NOT: vector.body:
122+
; CHECK: ret
123+
;
124+
entry:
125+
br label %for.body
126+
127+
for.body: ; preds = %entry, %for.body
128+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
129+
%ddst.addr = phi i33* [ %ddst, %entry ], [ %incdec.ptr3, %for.body ]
130+
%incdec.ptr = getelementptr inbounds i33, i33* %ddst.addr, i64 1
131+
store i33 10, i33* %ddst.addr, align 4, !nontemporal !8
132+
%incdec.ptr1 = getelementptr inbounds i33, i33* %ddst.addr, i64 2
133+
store i33 20, i33* %incdec.ptr, align 4, !nontemporal !8
134+
%incdec.ptr2 = getelementptr inbounds i33, i33* %ddst.addr, i64 3
135+
store i33 30, i33* %incdec.ptr1, align 4, !nontemporal !8
136+
%incdec.ptr3 = getelementptr inbounds i33, i33* %ddst.addr, i64 4
137+
store i33 40, i33* %incdec.ptr2, align 4, !nontemporal !8
138+
%add = add nuw nsw i32 %i, 3
139+
%cmp = icmp ult i32 %i, 4092
140+
br i1 %cmp, label %for.body, label %for.cond.cleanup
141+
142+
for.cond.cleanup: ; preds = %for.body
143+
ret void
144+
}
145+
146+
; Loop that performs four nontemporal i40 stores per iteration. The directives
; below require that no vector.body block appears before the return, i.e. the
; loop is expected to stay scalar (40 is a multiple of 8 but not a power of 2).
; NOTE(review): the induction variable steps by 3 while the pointer advances
; by 4 elements; confirm it is intentional.
define void @test_i40_store(i40* nocapture %ddst) {
147+
; CHECK-LABEL: define void @test_i40_store(
148+
; CHECK-NOT: vector.body:
149+
; CHECK: ret
150+
;
151+
entry:
152+
br label %for.body
153+
154+
for.body: ; preds = %entry, %for.body
155+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
156+
%ddst.addr = phi i40* [ %ddst, %entry ], [ %incdec.ptr3, %for.body ]
157+
%incdec.ptr = getelementptr inbounds i40, i40* %ddst.addr, i64 1
158+
store i40 10, i40* %ddst.addr, align 4, !nontemporal !8
159+
%incdec.ptr1 = getelementptr inbounds i40, i40* %ddst.addr, i64 2
160+
store i40 20, i40* %incdec.ptr, align 4, !nontemporal !8
161+
%incdec.ptr2 = getelementptr inbounds i40, i40* %ddst.addr, i64 3
162+
store i40 30, i40* %incdec.ptr1, align 4, !nontemporal !8
163+
%incdec.ptr3 = getelementptr inbounds i40, i40* %ddst.addr, i64 4
164+
store i40 40, i40* %incdec.ptr2, align 4, !nontemporal !8
165+
%add = add nuw nsw i32 %i, 3
166+
%cmp = icmp ult i32 %i, 4092
167+
br i1 %cmp, label %for.body, label %for.cond.cleanup
168+
169+
for.cond.cleanup: ; preds = %for.body
170+
ret void
171+
}
172+
; Loop with a single nontemporal i64 store per iteration. The directives below
; expect a vector.body block containing a <4 x i64> store that still carries
; !nontemporal metadata.
; NOTE(review): attribute group #0 is referenced here but no definition is
; visible in this file; confirm whether it can simply be dropped.
define void @test_i64_store(i64* nocapture %ddst) local_unnamed_addr #0 {
173+
; CHECK-LABEL: define void @test_i64_store(
174+
; CHECK-LABEL: vector.body:
175+
; CHECK: store <4 x i64> {{.*}} !nontemporal !0
176+
; CHECK: br
177+
;
178+
entry:
179+
br label %for.body
180+
181+
for.body: ; preds = %entry, %for.body
182+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
183+
%ddst.addr = phi i64* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
184+
%incdec.ptr = getelementptr inbounds i64, i64* %ddst.addr, i64 1
185+
store i64 10, i64* %ddst.addr, align 4, !nontemporal !8
186+
%add = add nuw nsw i32 %i, 4
187+
%cmp = icmp ult i32 %i, 4092
188+
br i1 %cmp, label %for.body, label %for.cond.cleanup
189+
190+
for.cond.cleanup: ; preds = %for.body
191+
ret void
192+
}
193+
194+
; Loop with a single nontemporal double store per iteration. The directives
; below expect a vector.body block containing a <4 x double> store that still
; carries !nontemporal metadata.
define void @test_double_store(double* %ddst) {
195+
; CHECK-LABEL: define void @test_double_store(
196+
; CHECK-LABEL: vector.body:
197+
; CHECK: store <4 x double> {{.*}} !nontemporal !0
198+
; CHECK: br
199+
;
200+
entry:
201+
br label %for.body
202+
203+
for.body: ; preds = %entry, %for.body
204+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
205+
%ddst.addr = phi double* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
206+
%incdec.ptr = getelementptr inbounds double, double* %ddst.addr, i64 1
207+
store double 10.0, double* %ddst.addr, align 4, !nontemporal !8
208+
%add = add nuw nsw i32 %i, 4
209+
%cmp = icmp ult i32 %i, 4092
210+
br i1 %cmp, label %for.body, label %for.cond.cleanup
211+
212+
for.cond.cleanup: ; preds = %for.body
213+
ret void
214+
}
215+
216+
; Loop with a single nontemporal i128 store per iteration. 128 bits is the
; largest element width accepted by AArch64TTIImpl::isLegalNTStore, and the
; directives below expect a vector.body block containing a <4 x i128> store
; that still carries !nontemporal metadata.
define void @test_i128_store(i128* %ddst) {
217+
; CHECK-LABEL: define void @test_i128_store(
218+
; CHECK-LABEL: vector.body:
219+
; CHECK: store <4 x i128> {{.*}} !nontemporal !0
220+
; CHECK: br
221+
;
222+
entry:
223+
br label %for.body
224+
225+
for.body: ; preds = %entry, %for.body
226+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
227+
%ddst.addr = phi i128* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
228+
%incdec.ptr = getelementptr inbounds i128, i128* %ddst.addr, i64 1
229+
store i128 10, i128* %ddst.addr, align 4, !nontemporal !8
230+
%add = add nuw nsw i32 %i, 4
231+
%cmp = icmp ult i32 %i, 4092
232+
br i1 %cmp, label %for.body, label %for.cond.cleanup
233+
234+
for.cond.cleanup: ; preds = %for.body
235+
ret void
236+
}
237+
238+
; Loop with a single nontemporal i256 store per iteration. The directives
; below require that no vector.body block appears before the function returns,
; i.e. the loop is expected to stay scalar (256 bits exceeds the 128-bit
; maximum element width accepted by AArch64TTIImpl::isLegalNTStore).
define void @test_i256_store(i256* %ddst) {
239+
; CHECK-LABEL: define void @test_i256_store(
240+
; CHECK-NOT: vector.body:
241+
; CHECK: ret void
242+
;
243+
entry:
244+
br label %for.body
245+
246+
for.body: ; preds = %entry, %for.body
247+
%i = phi i32 [ 0, %entry ], [ %add, %for.body ]
248+
%ddst.addr = phi i256* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
249+
%incdec.ptr = getelementptr inbounds i256, i256* %ddst.addr, i64 1
250+
store i256 10, i256* %ddst.addr, align 4, !nontemporal !8
251+
%add = add nuw nsw i32 %i, 4
252+
%cmp = icmp ult i32 %i, 4092
253+
br i1 %cmp, label %for.body, label %for.cond.cleanup
254+
255+
for.cond.cleanup: ; preds = %for.body
256+
ret void
257+
}
258+
259+
!8 = !{i32 1}

0 commit comments

Comments
 (0)