Skip to content

Commit ea7897a

Browse files
authored
[WebAssembly] Enable interleaved memory accesses (#125696)
Enable the vectorizer to access interleaved memory. This means that, when it is decided to be profitable, the memory accesses can be vectorized instead of the value being built up by a sequence of load_lane instructions. This will often increase the vectorization factor of the loop, leading to significantly better performance. I ran a reasonably large collection of benchmarks; most are not affected by this change, with most performance changes under 1%. However, I see a 2.5% speedup in the total run time of TSVC, a 1% speedup for SPEC2017 x265, a 28% speedup for a ResNet workload and a 95% speedup for libyuv. These measurements were taken running V8 on an AArch64 machine.
1 parent 948a847 commit ea7897a

File tree

2 files changed

+363
-0
lines changed

2 files changed

+363
-0
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
5757
/// \name Vector TTI Implementations
5858
/// @{
5959

60+
/// Unconditionally reports that interleaved memory accesses may be
/// vectorized on this target (the hook always returns true).
bool enableInterleavedAccessVectorization() { return true; }
61+
6062
unsigned getNumberOfRegisters(unsigned ClassID) const;
6163
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
6264
InstructionCost getArithmeticInstrCost(
Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
; Test driver: run the loop vectorizer with simd128 enabled, lower the result
; for wasm32 with llc (machine verifier on), and match the assembly with
; FileCheck against the CHECK lines in this file.
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
2+
3+
target triple = "wasm32"
4+
target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20"
5+
6+
; Output32xN: N i32 accumulators, written through the sret pointer.
; Input<W>xN: one loop element made of N adjacent fields of width W bits.
%struct.Output32x2 = type { i32, i32 }
7+
%struct.Input8x2 = type { i8, i8 }
8+
%struct.Output32x4 = type { i32, i32, i32, i32 }
9+
%struct.Input8x4 = type { i8, i8, i8, i8 }
10+
%struct.Input16x2 = type { i16, i16 }
11+
%struct.Input16x4 = type { i16, i16, i16, i16 }
12+
%struct.Input32x2 = type { i32, i32 }
13+
%struct.Input32x4 = type { i32, i32, i32, i32 }
14+
15+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
16+
; Adds the two i8 fields of each %struct.Input8x2 element at %1 (%2 elements)
; into the two i32 slots of the sret %struct.Output32x2 at %0, zero-extending
; every byte to i32 first. The CHECK lines expect the loop to contain a
; v128.load64_zero whose lanes are de-interleaved by two i8x16.shuffles (odd
; bytes, then even bytes) and widened to i32x4 before each i32x4.add.
define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
17+
; CHECK-LABEL: accumulate8x2:
18+
; CHECK: loop
19+
; CHECK: v128.load64_zero
20+
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
21+
; CHECK: i16x8.extend_low_i8x16_u
22+
; CHECK: i32x4.extend_low_i16x8_u
23+
; CHECK: i32x4.add
24+
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
25+
; CHECK: i16x8.extend_low_i8x16_u
26+
; CHECK: i32x4.extend_low_i16x8_u
27+
; CHECK: i32x4.add
28+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
29+
%5 = icmp eq i32 %2, 0
30+
br i1 %5, label %10, label %6
31+
32+
; Preheader: read the second running sum from byte offset 4 of the output.
6: ; preds = %3
33+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
34+
%8 = load i32, ptr %7, align 4
35+
br label %12
36+
37+
; Loop exit: write the second sum back.
9: ; preds = %12
38+
store i32 %23, ptr %7, align 4
39+
br label %10
40+
41+
; Return path: write the first sum back (still %4 when the loop was skipped).
10: ; preds = %9, %3
42+
%11 = phi i32 [ %21, %9 ], [ %4, %3 ]
43+
store i32 %11, ptr %0, align 4
44+
ret void
45+
46+
; Loop body: %14 is the element index; %15 and %13 are the running sums for
; field 0 and field 1 respectively.
12: ; preds = %6, %12
47+
%13 = phi i32 [ %8, %6 ], [ %23, %12 ]
48+
%14 = phi i32 [ 0, %6 ], [ %24, %12 ]
49+
%15 = phi i32 [ %4, %6 ], [ %21, %12 ]
50+
%16 = getelementptr inbounds nuw %struct.Input8x2, ptr %1, i32 %14
51+
%17 = load i8, ptr %16, align 1
52+
%18 = getelementptr inbounds nuw i8, ptr %16, i32 1
53+
%19 = load i8, ptr %18, align 1
54+
%20 = zext i8 %17 to i32
55+
%21 = add i32 %15, %20
56+
%22 = zext i8 %19 to i32
57+
%23 = add i32 %13, %22
58+
%24 = add nuw i32 %14, 1
59+
%25 = icmp eq i32 %24, %2
60+
br i1 %25, label %9, label %12
61+
}
62+
63+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
64+
; Adds the four i8 fields of each %struct.Input8x4 element at %1 (%2 elements)
; into the four i32 slots of the sret %struct.Output32x4 at %0, zero-extending
; every byte to i32 first. The CHECK lines expect the loop to contain a
; v128.load followed by four stride-4 i8x16.shuffles (fields 3, 2, 1, 0), each
; widened to i32x4 and fed to an i32x4.add.
define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
65+
; CHECK-LABEL: accumulate8x4
66+
; CHECK: loop
67+
; CHECK: v128.load
68+
; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
69+
; CHECK: i16x8.extend_low_i8x16_u
70+
; CHECK: i32x4.extend_low_i16x8_u
71+
; CHECK: i32x4.add
72+
; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
73+
; CHECK: i16x8.extend_low_i8x16_u
74+
; CHECK: i32x4.extend_low_i16x8_u
75+
; CHECK: i32x4.add
76+
; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
77+
; CHECK: i16x8.extend_low_i8x16_u
78+
; CHECK: i32x4.extend_low_i16x8_u
79+
; CHECK: i32x4.add
80+
; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
81+
; CHECK: i16x8.extend_low_i8x16_u
82+
; CHECK: i32x4.extend_low_i16x8_u
83+
; CHECK: i32x4.add
84+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
85+
%5 = icmp eq i32 %2, 0
86+
br i1 %5, label %14, label %6
87+
88+
; Preheader: read the remaining three running sums (byte offsets 4, 8, 12).
6: ; preds = %3
89+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
90+
%8 = getelementptr inbounds nuw i8, ptr %0, i32 8
91+
%9 = getelementptr inbounds nuw i8, ptr %0, i32 12
92+
%10 = load i32, ptr %7, align 4
93+
%11 = load i32, ptr %8, align 4
94+
%12 = load i32, ptr %9, align 4
95+
br label %16
96+
97+
; Loop exit: write sums 1..3 back.
13: ; preds = %16
98+
store i32 %33, ptr %7, align 4
99+
store i32 %35, ptr %8, align 4
100+
store i32 %37, ptr %9, align 4
101+
br label %14
102+
103+
; Return path: write sum 0 back (still %4 when the loop was skipped).
14: ; preds = %13, %3
104+
%15 = phi i32 [ %31, %13 ], [ %4, %3 ]
105+
store i32 %15, ptr %0, align 4
106+
ret void
107+
108+
; Loop body: %20 is the element index; %21, %19, %18, %17 are the running sums
; for fields 0, 1, 2, 3 respectively.
16: ; preds = %6, %16
109+
%17 = phi i32 [ %12, %6 ], [ %37, %16 ]
110+
%18 = phi i32 [ %11, %6 ], [ %35, %16 ]
111+
%19 = phi i32 [ %10, %6 ], [ %33, %16 ]
112+
%20 = phi i32 [ 0, %6 ], [ %38, %16 ]
113+
%21 = phi i32 [ %4, %6 ], [ %31, %16 ]
114+
%22 = getelementptr inbounds nuw %struct.Input8x4, ptr %1, i32 %20
115+
%23 = load i8, ptr %22, align 1
116+
%24 = getelementptr inbounds nuw i8, ptr %22, i32 1
117+
%25 = load i8, ptr %24, align 1
118+
%26 = getelementptr inbounds nuw i8, ptr %22, i32 2
119+
%27 = load i8, ptr %26, align 1
120+
%28 = getelementptr inbounds nuw i8, ptr %22, i32 3
121+
%29 = load i8, ptr %28, align 1
122+
%30 = zext i8 %23 to i32
123+
%31 = add i32 %21, %30
124+
%32 = zext i8 %25 to i32
125+
%33 = add i32 %19, %32
126+
%34 = zext i8 %27 to i32
127+
%35 = add i32 %18, %34
128+
%36 = zext i8 %29 to i32
129+
%37 = add i32 %17, %36
130+
%38 = add nuw i32 %20, 1
131+
%39 = icmp eq i32 %38, %2
132+
br i1 %39, label %13, label %16
133+
}
134+
135+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
136+
; Adds the two i16 fields of each %struct.Input16x2 element at %1 (%2 elements)
; into the two i32 slots of the sret %struct.Output32x2 at %0, zero-extending
; every halfword to i32 first. The CHECK lines expect the loop to contain a
; v128.load whose 16-bit lanes are de-interleaved by two i8x16.shuffles (odd
; lanes, then even lanes) and widened with i32x4.extend_low_i16x8_u before
; each i32x4.add.
define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
137+
; CHECK-LABEL: accumulate16x2
138+
; CHECK: loop
139+
; CHECK: v128.load
140+
; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
141+
; CHECK: i32x4.extend_low_i16x8_u
142+
; CHECK: i32x4.add
143+
; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
144+
; CHECK: i32x4.extend_low_i16x8_u
145+
; CHECK: i32x4.add
146+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
147+
%5 = icmp eq i32 %2, 0
148+
br i1 %5, label %10, label %6
149+
150+
; Preheader: read the second running sum from byte offset 4 of the output.
6: ; preds = %3
151+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
152+
%8 = load i32, ptr %7, align 4
153+
br label %12
154+
155+
; Loop exit: write the second sum back.
9: ; preds = %12
156+
store i32 %23, ptr %7, align 4
157+
br label %10
158+
159+
; Return path: write the first sum back (still %4 when the loop was skipped).
10: ; preds = %9, %3
160+
%11 = phi i32 [ %21, %9 ], [ %4, %3 ]
161+
store i32 %11, ptr %0, align 4
162+
ret void
163+
164+
; Loop body: %14 is the element index; %15 and %13 are the running sums for
; field 0 and field 1 respectively.
12: ; preds = %6, %12
165+
%13 = phi i32 [ %8, %6 ], [ %23, %12 ]
166+
%14 = phi i32 [ 0, %6 ], [ %24, %12 ]
167+
%15 = phi i32 [ %4, %6 ], [ %21, %12 ]
168+
%16 = getelementptr inbounds nuw %struct.Input16x2, ptr %1, i32 %14
169+
%17 = load i16, ptr %16, align 2
170+
%18 = getelementptr inbounds nuw i8, ptr %16, i32 2
171+
%19 = load i16, ptr %18, align 2
172+
%20 = zext i16 %17 to i32
173+
%21 = add i32 %15, %20
174+
%22 = zext i16 %19 to i32
175+
%23 = add i32 %13, %22
176+
%24 = add nuw i32 %14, 1
177+
%25 = icmp eq i32 %24, %2
178+
br i1 %25, label %9, label %12
179+
}
180+
181+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
182+
; Adds the four i16 fields of each %struct.Input16x4 element at %1 (%2
; elements) into the four i32 slots of the sret %struct.Output32x4 at %0,
; zero-extending every halfword to i32 first. The CHECK lines expect the loop
; to contain two v128.loads (offsets 0 and 16) combined by four two-input
; i8x16.shuffles (fields 3, 2, 1, 0), each widened with
; i32x4.extend_low_i16x8_u and fed to an i32x4.add.
define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
183+
; CHECK-LABEL: accumulate16x4
184+
; CHECK: loop
185+
; CHECK: v128.load 0:p2align=1
186+
; CHECK: v128.load 16:p2align=1
187+
; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
188+
; CHECK: i32x4.extend_low_i16x8_u
189+
; CHECK: i32x4.add
190+
; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
191+
; CHECK: i32x4.extend_low_i16x8_u
192+
; CHECK: i32x4.add
193+
; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
194+
; CHECK: i32x4.extend_low_i16x8_u
195+
; CHECK: i32x4.add
196+
; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
197+
; CHECK: i32x4.extend_low_i16x8_u
198+
; CHECK: i32x4.add
199+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
200+
%5 = icmp eq i32 %2, 0
201+
br i1 %5, label %14, label %6
202+
203+
; Preheader: read the remaining three running sums (byte offsets 4, 8, 12).
6: ; preds = %3
204+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
205+
%8 = getelementptr inbounds nuw i8, ptr %0, i32 8
206+
%9 = getelementptr inbounds nuw i8, ptr %0, i32 12
207+
%10 = load i32, ptr %7, align 4
208+
%11 = load i32, ptr %8, align 4
209+
%12 = load i32, ptr %9, align 4
210+
br label %16
211+
212+
; Loop exit: write sums 1..3 back.
13: ; preds = %16
213+
store i32 %33, ptr %7, align 4
214+
store i32 %35, ptr %8, align 4
215+
store i32 %37, ptr %9, align 4
216+
br label %14
217+
218+
; Return path: write sum 0 back (still %4 when the loop was skipped).
14: ; preds = %13, %3
219+
%15 = phi i32 [ %31, %13 ], [ %4, %3 ]
220+
store i32 %15, ptr %0, align 4
221+
ret void
222+
223+
; Loop body: %20 is the element index; %21, %19, %18, %17 are the running sums
; for fields 0, 1, 2, 3 respectively.
16: ; preds = %6, %16
224+
%17 = phi i32 [ %12, %6 ], [ %37, %16 ]
225+
%18 = phi i32 [ %11, %6 ], [ %35, %16 ]
226+
%19 = phi i32 [ %10, %6 ], [ %33, %16 ]
227+
%20 = phi i32 [ 0, %6 ], [ %38, %16 ]
228+
%21 = phi i32 [ %4, %6 ], [ %31, %16 ]
229+
%22 = getelementptr inbounds nuw %struct.Input16x4, ptr %1, i32 %20
230+
%23 = load i16, ptr %22, align 2
231+
%24 = getelementptr inbounds nuw i8, ptr %22, i32 2
232+
%25 = load i16, ptr %24, align 2
233+
%26 = getelementptr inbounds nuw i8, ptr %22, i32 4
234+
%27 = load i16, ptr %26, align 2
235+
%28 = getelementptr inbounds nuw i8, ptr %22, i32 6
236+
%29 = load i16, ptr %28, align 2
237+
%30 = zext i16 %23 to i32
238+
%31 = add i32 %21, %30
239+
%32 = zext i16 %25 to i32
240+
%33 = add i32 %19, %32
241+
%34 = zext i16 %27 to i32
242+
%35 = add i32 %18, %34
243+
%36 = zext i16 %29 to i32
244+
%37 = add i32 %17, %36
245+
%38 = add nuw i32 %20, 1
246+
%39 = icmp eq i32 %38, %2
247+
br i1 %39, label %13, label %16
248+
}
249+
250+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
251+
; Adds the two i32 fields of each %struct.Input32x2 element at %1 (%2 elements)
; into the two i32 slots of the sret %struct.Output32x2 at %0; no widening is
; needed. The CHECK lines expect the loop to contain two v128.loads (offsets 0
; and 16) de-interleaved by two two-input i8x16.shuffles (odd 32-bit lanes,
; then even lanes), each feeding an i32x4.add.
define hidden void @accumulate32x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
252+
; CHECK-LABEL: accumulate32x2
253+
; CHECK: loop
254+
; CHECK: v128.load 0:p2align=2
255+
; CHECK: v128.load 16:p2align=2
256+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
257+
; CHECK: i32x4.add
258+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
259+
; CHECK: i32x4.add
260+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
261+
%5 = icmp eq i32 %2, 0
262+
br i1 %5, label %10, label %6
263+
264+
; Preheader: read the second running sum from byte offset 4 of the output.
6: ; preds = %3
265+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
266+
%8 = load i32, ptr %7, align 4
267+
br label %12
268+
269+
; Loop exit: write the second sum back.
9: ; preds = %12
270+
store i32 %21, ptr %7, align 4
271+
br label %10
272+
273+
; Return path: write the first sum back (still %4 when the loop was skipped).
10: ; preds = %9, %3
274+
%11 = phi i32 [ %20, %9 ], [ %4, %3 ]
275+
store i32 %11, ptr %0, align 4
276+
ret void
277+
278+
; Loop body: %14 is the element index; %15 and %13 are the running sums for
; field 0 and field 1 respectively.
12: ; preds = %6, %12
279+
%13 = phi i32 [ %8, %6 ], [ %21, %12 ]
280+
%14 = phi i32 [ 0, %6 ], [ %22, %12 ]
281+
%15 = phi i32 [ %4, %6 ], [ %20, %12 ]
282+
%16 = getelementptr inbounds nuw %struct.Input32x2, ptr %1, i32 %14
283+
%17 = load i32, ptr %16, align 4
284+
%18 = getelementptr inbounds nuw i8, ptr %16, i32 4
285+
%19 = load i32, ptr %18, align 4
286+
%20 = add i32 %15, %17
287+
%21 = add i32 %13, %19
288+
%22 = add nuw i32 %14, 1
289+
%23 = icmp eq i32 %22, %2
290+
br i1 %23, label %9, label %12
291+
}
292+
293+
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
294+
; Adds the four i32 fields of each %struct.Input32x4 element at %1 (%2
; elements) into the four i32 slots of the sret %struct.Output32x4 at %0; no
; widening is needed. The CHECK lines expect four v128.loads (offsets 0, 16,
; 32, 48) and, for each field (3, 2, 1, 0), three i8x16.shuffles that gather
; that field's lanes before an i32x4.add.
; NOTE(review): unlike the other functions in this file, there is no
; `CHECK: loop` line after the label here - confirm whether that is intended.
define hidden void @accumulate32x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
295+
; CHECK-LABEL: accumulate32x4
296+
; CHECK: v128.load 0:p2align=2
297+
; CHECK: v128.load 16:p2align=2
298+
; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
299+
; CHECK: v128.load 32:p2align=2
300+
; CHECK: v128.load 48:p2align=2
301+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
302+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
303+
; CHECK: i32x4.add
304+
; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
305+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
306+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
307+
; CHECK: i32x4.add
308+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
309+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
310+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
311+
; CHECK: i32x4.add
312+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
313+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
314+
; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
315+
; CHECK: i32x4.add
316+
; Entry: read the first running sum; a zero element count bypasses the loop.
%4 = load i32, ptr %0, align 4
317+
%5 = icmp eq i32 %2, 0
318+
br i1 %5, label %14, label %6
319+
320+
; Preheader: read the remaining three running sums (byte offsets 4, 8, 12).
6: ; preds = %3
321+
%7 = getelementptr inbounds nuw i8, ptr %0, i32 4
322+
%8 = getelementptr inbounds nuw i8, ptr %0, i32 8
323+
%9 = getelementptr inbounds nuw i8, ptr %0, i32 12
324+
%10 = load i32, ptr %7, align 4
325+
%11 = load i32, ptr %8, align 4
326+
%12 = load i32, ptr %9, align 4
327+
br label %16
328+
329+
; Loop exit: write sums 1..3 back.
13: ; preds = %16
330+
store i32 %31, ptr %7, align 4
331+
store i32 %32, ptr %8, align 4
332+
store i32 %33, ptr %9, align 4
333+
br label %14
334+
335+
; Return path: write sum 0 back (still %4 when the loop was skipped).
14: ; preds = %13, %3
336+
%15 = phi i32 [ %30, %13 ], [ %4, %3 ]
337+
store i32 %15, ptr %0, align 4
338+
ret void
339+
340+
; Loop body: %20 is the element index; %21, %19, %18, %17 are the running sums
; for fields 0, 1, 2, 3 respectively.
16: ; preds = %6, %16
341+
%17 = phi i32 [ %12, %6 ], [ %33, %16 ]
342+
%18 = phi i32 [ %11, %6 ], [ %32, %16 ]
343+
%19 = phi i32 [ %10, %6 ], [ %31, %16 ]
344+
%20 = phi i32 [ 0, %6 ], [ %34, %16 ]
345+
%21 = phi i32 [ %4, %6 ], [ %30, %16 ]
346+
%22 = getelementptr inbounds nuw %struct.Input32x4, ptr %1, i32 %20
347+
%23 = load i32, ptr %22, align 4
348+
%24 = getelementptr inbounds nuw i8, ptr %22, i32 4
349+
%25 = load i32, ptr %24, align 4
350+
%26 = getelementptr inbounds nuw i8, ptr %22, i32 8
351+
%27 = load i32, ptr %26, align 4
352+
%28 = getelementptr inbounds nuw i8, ptr %22, i32 12
353+
%29 = load i32, ptr %28, align 4
354+
%30 = add i32 %21, %23
355+
%31 = add i32 %19, %25
356+
%32 = add i32 %18, %27
357+
%33 = add i32 %17, %29
358+
%34 = add nuw i32 %20, 1
359+
%35 = icmp eq i32 %34, %2
360+
br i1 %35, label %13, label %16
361+
}

0 commit comments

Comments
 (0)