@@ -36,7 +36,7 @@ target triple = "aarch64--linux-gnu"
36
36
; YAML-NEXT: Function: getelementptr_4x32
37
37
; YAML-NEXT: Args:
38
38
; YAML-NEXT: - String: 'SLP vectorized with cost '
39
- ; YAML-NEXT: - Cost: '6 '
39
+ ; YAML-NEXT: - Cost: '16 '
40
40
; YAML-NEXT: - String: ' and with tree size '
41
41
; YAML-NEXT: - TreeSize: '3'
42
42
@@ -46,50 +46,49 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
46
46
; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
47
47
; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
48
48
; CHECK: for.body.preheader:
49
- ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 undef>, i32 [[X:%.*]], i32 1
50
- ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef , i32 [[Y:%.*]], i32 0
51
- ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
49
+ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 undef , i32 undef>, i32 [[X:%.*]], i32 1
50
+ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]] , i32 [[Y:%.*]], i32 2
51
+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
52
52
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
53
53
; CHECK: for.cond.cleanup.loopexit:
54
- ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP22 :%.*]], i32 1
54
+ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21 :%.*]], i32 1
55
55
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
56
56
; CHECK: for.cond.cleanup:
57
57
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
58
58
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
59
59
; CHECK: for.body:
60
- ; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP22 ]], [[FOR_BODY]] ]
60
+ ; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21 ]], [[FOR_BODY]] ]
61
61
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
62
62
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
63
- ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
64
- ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <2 x i32> zeroinitializer
65
- ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], [[TMP0 ]]
66
- ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
63
+ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
64
+ ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
65
+ ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2 ]]
66
+ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
67
67
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
68
68
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
69
69
; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
70
70
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
71
71
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]]
72
- ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
72
+ ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
73
73
; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
74
74
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]]
75
75
; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
76
76
; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
77
- ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i32> [[TMP7]], [[TMP2]]
78
- ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
79
- ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
80
- ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP16]]
77
+ ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
78
+ ; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
79
+ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]]
81
80
; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
82
81
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
83
- ; CHECK-NEXT: [[TMP17 :%.*]] = extractelement <2 x i32> [[TMP14 ]], i32 1
84
- ; CHECK-NEXT: [[TMP18 :%.*]] = sext i32 [[TMP17 ]] to i64
85
- ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP18 ]]
82
+ ; CHECK-NEXT: [[TMP16 :%.*]] = extractelement <4 x i32> [[TMP8 ]], i32 3
83
+ ; CHECK-NEXT: [[TMP17 :%.*]] = sext i32 [[TMP16 ]] to i64
84
+ ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17 ]]
86
85
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
87
- ; CHECK-NEXT: [[TMP19 :%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
88
- ; CHECK-NEXT: [[TMP20 :%.*]] = insertelement <2 x i32> [[TMP19 ]], i32 [[ADD11]], i32 1
89
- ; CHECK-NEXT: [[TMP21 :%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
90
- ; CHECK-NEXT: [[TMP22 ]] = add nsw <2 x i32> [[TMP20 ]], [[TMP21 ]]
91
- ; CHECK-NEXT: [[TMP23 :%.*]] = extractelement <2 x i32> [[TMP22 ]], i32 0
92
- ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP23 ]], [[N]]
86
+ ; CHECK-NEXT: [[TMP18 :%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
87
+ ; CHECK-NEXT: [[TMP19 :%.*]] = insertelement <2 x i32> [[TMP18 ]], i32 [[ADD11]], i32 1
88
+ ; CHECK-NEXT: [[TMP20 :%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
89
+ ; CHECK-NEXT: [[TMP21 ]] = add nsw <2 x i32> [[TMP19 ]], [[TMP20 ]]
90
+ ; CHECK-NEXT: [[TMP22 :%.*]] = extractelement <2 x i32> [[TMP21 ]], i32 0
91
+ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22 ]], [[N]]
93
92
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
94
93
;
95
94
entry:
@@ -235,3 +234,161 @@ for.body:
235
234
%exitcond = icmp eq i32 %indvars.iv.next , %n
236
235
br i1 %exitcond , label %for.cond.cleanup.loopexit , label %for.body
237
236
}
237
+ ; Holds the i32* base pointer that @test_i16_extend loads as %p.0.
238
+ @global = internal global { i32* } zeroinitializer , align 8
239
+
240
+ ; Make sure we vectorize to maximize the load width when loading i16 and
241
+ ; extending it for compute operations.
242
+ define void @test_i16_extend (i16* %p.1 , i16* %p.2 , i32 %idx.i32 ) {
243
+ ; CHECK-LABEL: @test_i16_extend(
244
+ ; CHECK-NEXT: [[P_0:%.*]] = load i32*, i32** getelementptr inbounds ({ i32* }, { i32* }* @global, i64 0, i32 0), align 8
245
+ ; CHECK-NEXT: [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64
246
+ ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i16, i16* [[P_1:%.*]], i64 [[IDX_0]]
247
+ ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i16, i16* [[P_2:%.*]], i64 [[IDX_0]]
248
+ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP53]] to <8 x i16>*
249
+ ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
250
+ ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
251
+ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP56]] to <8 x i16>*
252
+ ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
253
+ ; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[TMP5]] to <8 x i32>
254
+ ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP6]]
255
+ ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP7]], i32 0
256
+ ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
257
+ ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP9]]
258
+ ; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[TMP60]], align 4
259
+ ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP7]], i32 1
260
+ ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
261
+ ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP11]]
262
+ ; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[TMP71]], align 4
263
+ ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP7]], i32 2
264
+ ; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
265
+ ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP13]]
266
+ ; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[TMP82]], align 4
267
+ ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP7]], i32 3
268
+ ; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
269
+ ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP15]]
270
+ ; CHECK-NEXT: [[L_4:%.*]] = load i32, i32* [[TMP93]], align 4
271
+ ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP7]], i32 4
272
+ ; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
273
+ ; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP17]]
274
+ ; CHECK-NEXT: [[L_5:%.*]] = load i32, i32* [[TMP104]], align 4
275
+ ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP7]], i32 5
276
+ ; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
277
+ ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP19]]
278
+ ; CHECK-NEXT: [[L_6:%.*]] = load i32, i32* [[TMP115]], align 4
279
+ ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP7]], i32 6
280
+ ; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
281
+ ; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP21]]
282
+ ; CHECK-NEXT: [[L_7:%.*]] = load i32, i32* [[TMP126]], align 4
283
+ ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP7]], i32 7
284
+ ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
285
+ ; CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, i32* [[P_0]], i64 [[TMP23]]
286
+ ; CHECK-NEXT: [[L_8:%.*]] = load i32, i32* [[TMP137]], align 4
287
+ ; CHECK-NEXT: call void @use(i32 [[L_1]], i32 [[L_2]], i32 [[L_3]], i32 [[L_4]], i32 [[L_5]], i32 [[L_6]], i32 [[L_7]], i32 [[L_8]])
288
+ ; CHECK-NEXT: ret void
289
+ ;
290
+ %g = getelementptr inbounds { i32* }, { i32 *}* @global , i64 0 , i32 0
291
+ %p.0 = load i32* , i32** %g , align 8
292
+ ; Zero-extend the i32 index once; %idx.1 through %idx.7 are %idx.0 plus 1 through 7.
293
+ %idx.0 = zext i32 %idx.i32 to i64
294
+ %idx.1 = add nsw i64 %idx.0 , 1
295
+ %idx.2 = add nsw i64 %idx.0 , 2
296
+ %idx.3 = add nsw i64 %idx.0 , 3
297
+ %idx.4 = add nsw i64 %idx.0 , 4
298
+ %idx.5 = add nsw i64 %idx.0 , 5
299
+ %idx.6 = add nsw i64 %idx.0 , 6
300
+ %idx.7 = add nsw i64 %idx.0 , 7
301
+ ; Element 0: load one i16 from %p.1 and one from %p.2 at %idx.0, zero-extend both to i64, subtract.
302
+ %tmp53 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.0
303
+ %op1.l = load i16 , i16* %tmp53 , align 2
304
+ %op1.ext = zext i16 %op1.l to i64
305
+ %tmp56 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.0
306
+ %op2.l = load i16 , i16* %tmp56 , align 2
307
+ %op2.ext = zext i16 %op2.l to i64
308
+ %sub.1 = sub nsw i64 %op1.ext , %op2.ext
309
+ ; The i64 difference is the element index into %p.0 for the i32 load below.
310
+ %tmp60 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.1
311
+ %l.1 = load i32 , i32* %tmp60 , align 4
312
+ ; Element 1: same load/zext/sub sequence at %idx.1, then the indexed i32 load from %p.0.
313
+ %tmp64 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.1
314
+ %tmp65 = load i16 , i16* %tmp64 , align 2
315
+ %tmp66 = zext i16 %tmp65 to i64
316
+ %tmp67 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.1
317
+ %tmp68 = load i16 , i16* %tmp67 , align 2
318
+ %tmp69 = zext i16 %tmp68 to i64
319
+ %sub.2 = sub nsw i64 %tmp66 , %tmp69
320
+
321
+ %tmp71 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.2
322
+ %l.2 = load i32 , i32* %tmp71 , align 4
323
+ ; Element 2: same sequence at %idx.2.
324
+ %tmp75 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.2
325
+ %tmp76 = load i16 , i16* %tmp75 , align 2
326
+ %tmp77 = zext i16 %tmp76 to i64
327
+ %tmp78 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.2
328
+ %tmp79 = load i16 , i16* %tmp78 , align 2
329
+ %tmp80 = zext i16 %tmp79 to i64
330
+ %sub.3 = sub nsw i64 %tmp77 , %tmp80
331
+
332
+ %tmp82 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.3
333
+ %l.3 = load i32 , i32* %tmp82 , align 4
334
+ ; Element 3: same sequence at %idx.3.
335
+ %tmp86 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.3
336
+ %tmp87 = load i16 , i16* %tmp86 , align 2
337
+ %tmp88 = zext i16 %tmp87 to i64
338
+ %tmp89 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.3
339
+ %tmp90 = load i16 , i16* %tmp89 , align 2
340
+ %tmp91 = zext i16 %tmp90 to i64
341
+ %sub.4 = sub nsw i64 %tmp88 , %tmp91
342
+
343
+ %tmp93 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.4
344
+ %l.4 = load i32 , i32* %tmp93 , align 4
345
+ ; Element 4: same sequence at %idx.4.
346
+ %tmp97 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.4
347
+ %tmp98 = load i16 , i16* %tmp97 , align 2
348
+ %tmp99 = zext i16 %tmp98 to i64
349
+ %tmp100 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.4
350
+ %tmp101 = load i16 , i16* %tmp100 , align 2
351
+ %tmp102 = zext i16 %tmp101 to i64
352
+ %sub.5 = sub nsw i64 %tmp99 , %tmp102
353
+
354
+ %tmp104 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.5
355
+ %l.5 = load i32 , i32* %tmp104 , align 4
356
+ ; Element 5: same sequence at %idx.5.
357
+ %tmp108 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.5
358
+ %tmp109 = load i16 , i16* %tmp108 , align 2
359
+ %tmp110 = zext i16 %tmp109 to i64
360
+ %tmp111 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.5
361
+ %tmp112 = load i16 , i16* %tmp111 , align 2
362
+ %tmp113 = zext i16 %tmp112 to i64
363
+ %sub.6 = sub nsw i64 %tmp110 , %tmp113
364
+
365
+ %tmp115 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.6
366
+ %l.6 = load i32 , i32* %tmp115 , align 4
367
+ ; Element 6: same sequence at %idx.6.
368
+ %tmp119 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.6
369
+ %tmp120 = load i16 , i16* %tmp119 , align 2
370
+ %tmp121 = zext i16 %tmp120 to i64
371
+ %tmp122 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.6
372
+ %tmp123 = load i16 , i16* %tmp122 , align 2
373
+ %tmp124 = zext i16 %tmp123 to i64
374
+ %sub.7 = sub nsw i64 %tmp121 , %tmp124
375
+
376
+ %tmp126 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.7
377
+ %l.7 = load i32 , i32* %tmp126 , align 4
378
+ ; Element 7: same sequence at %idx.7.
379
+ %tmp130 = getelementptr inbounds i16 , i16* %p.1 , i64 %idx.7
380
+ %tmp131 = load i16 , i16* %tmp130 , align 2
381
+ %tmp132 = zext i16 %tmp131 to i64
382
+ %tmp133 = getelementptr inbounds i16 , i16* %p.2 , i64 %idx.7
383
+ %tmp134 = load i16 , i16* %tmp133 , align 2
384
+ %tmp135 = zext i16 %tmp134 to i64
385
+ %sub.8 = sub nsw i64 %tmp132 , %tmp135
386
+
387
+ %tmp137 = getelementptr inbounds i32 , i32* %p.0 , i64 %sub.8
388
+ %l.8 = load i32 , i32* %tmp137 , align 4
389
+ ; All eight loaded i32 values are passed to @use.
390
+ call void @use (i32 %l.1 , i32 %l.2 , i32 %l.3 , i32 %l.4 , i32 %l.5 , i32 %l.6 , i32 %l.7 , i32 %l.8 )
391
+ ret void
392
+ }
393
+ ; Declaration only: @use takes the eight i32 values loaded in @test_i16_extend.
394
+ declare void @use (i32 , i32 , i32 , i32 , i32 , i32 , i32 , i32 )
0 commit comments